From ce35cd30206b352f4aea3eb99953e08b3201e72a Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:15:49 -0500 Subject: [PATCH 01/46] fix response from nn --- src/dug/core/annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index bbf766b..79686de 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -429,7 +429,7 @@ def make_request(self, curie: str, http_session: Session): def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms - return raw_synonyms.get(curie, []) + return raw_synonyms.get(curie, {}).get('names', []) From 149be9fb7a6c62f94c95c476c4ca839437905727 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:58:12 -0500 Subject: [PATCH 02/46] norm returned values from make_request --- src/dug/core/annotate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 79686de..59dd379 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -418,14 +418,14 @@ def make_request(self, curie: str, http_session: Session): response = http_session.post(url, json=payload) if str(response.status_code).startswith('4'): logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") - return {curie: []} + return {curie: {"names": []}} if str(response.status_code).startswith('5'): logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") - return {curie: []} + return {curie: {"names": []}} return response.json() except json.decoder.JSONDecodeError as e: logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") - return {curie: []} + return {curie: {"names": []}} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms
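Both patches above assume the name-resolution reverse-lookup response keys results by CURIE and nests synonyms under a "names" list; the error fallbacks now mirror that shape so handle_response stays uniform. A minimal sketch of the resulting behavior, with a hypothetical response:

    raw_synonyms = {"MONDO:0005148": {"names": ["type 2 diabetes mellitus", "T2DM"]}}

    def handle_response(curie, raw_synonyms):
        # A missing curie falls through to {} and then to [] instead of raising
        return raw_synonyms.get(curie, {}).get("names", [])

    assert handle_response("MONDO:0005148", raw_synonyms) == ["type 2 diabetes mellitus", "T2DM"]
    assert handle_response("CHEBI:0000000", raw_synonyms) == []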
From 24b34e9331e237e55540b58b5aeee60c3ed5cd8e Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:34:37 -0500 Subject: [PATCH 03/46] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index acfc5bb..57184ae 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -1,5 +1,6 @@ import json from typing import Union, Callable, Any, Iterable +import copy from dug.core.loaders import InputFile @@ -29,7 +30,11 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - return self.__dict__ + copy_dict = copy(self.__dict__) + concepts = {k: v.jsonable() for k, v in concepts.items} + copy_dict['concepts'] = concepts + return copy_dict + def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -132,7 +137,10 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - return self.__dict__ + copy_dict = copy(self.__dict__) + identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} + copy_dict['identifiers'] = identifiers + return copy_dict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 785b789ee698c5697112986056423b574dd8e23f Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:52:22 -0500 Subject: [PATCH 04/46] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 57184ae..a67c6cd 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -30,7 +30,7 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - copy_dict = copy(self.__dict__) + copy_dict = copy.deepcopy(self.__dict__) concepts = {k: v.jsonable() for k, v in concepts.items} copy_dict['concepts'] = concepts return copy_dict @@ -137,7 +137,7 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - copy_dict = copy(self.__dict__) + copy_dict = copy.deepcopy(self.__dict__) identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} copy_dict['identifiers'] = identifiers return copy_dict From fec990a8278df2b4087bb2bc35309ff04bb880ac Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:58:20 -0500 Subject: [PATCH 05/46] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index a67c6cd..46083e7 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,7 +31,7 @@ def add_concept(self, concept): def jsonable(self): copy_dict = copy.deepcopy(self.__dict__) - concepts = {k: v.jsonable() for k, v in concepts.items} + concepts = {k: v.jsonable() for k, v in self.concepts.items} copy_dict['concepts'] = concepts return copy_dict From edfff4f9eb9e42e7fce4f6157782546a8b5b608b Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 14:10:21 -0500 Subject: [PATCH 06/46] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 46083e7..231608e 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,7 +31,7 @@ def add_concept(self, concept): def jsonable(self): copy_dict = copy.deepcopy(self.__dict__) - concepts = {k: v.jsonable() for k, v in self.concepts.items} + concepts = {k: v.jsonable() for k, v in self.concepts.items()} copy_dict['concepts'] = concepts return copy_dict
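Patches 03-06 above converge on one idea: returning self.__dict__ alone leaves nested DugConcept/Identifier objects in the payload, so a plain json.dumps on the result fails, while recursing through each child's jsonable() yields plain dicts first. A standalone sketch with hypothetical classes:

    import json

    class Child:
        def __init__(self, value):
            self.value = value
        def jsonable(self):
            return self.__dict__

    class Parent:
        def __init__(self):
            self.children = {"a": Child(1)}
        def jsonable(self):
            out = dict(self.__dict__)  # copy so the live attribute keeps real objects
            out["children"] = {k: v.jsonable() for k, v in self.children.items()}
            return out

    json.dumps(Parent().jsonable())   # ok
    # json.dumps(Parent().__dict__)   # would raise TypeError: Child is not JSON serializable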
From f01844ac42e160748bb67812b570e3e7c8bead65 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:23:33 -0500 Subject: [PATCH 07/46] parameterize all identifier inner vars; --- src/dug/core/annotate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 59dd379..fa566dd 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -18,7 +18,7 @@ class Identifier: - def __init__(self, id, label, types=None, search_text="", description=""): + def __init__(self, id, label, types=None, search_text=[], description="", equivalent_identifiers=[], synonyms=[], purl = []): self.id = id self.label = label self.description = description if types is None: types = [] self.types = types self.search_text = [search_text] if search_text else [] - self.equivalent_identifiers = [] - self.synonyms = [] - self.purl = "" + self.equivalent_identifiers = equivalent_identifiers + self.synonyms = synonyms + self.purl = purl @property def id_type(self): From c70940fba144d92eb5732c46470ed4ba116be080 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:44:54 -0500 Subject: [PATCH 08/46] parameterize everything for init from json form --- src/dug/core/parsers/_base.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 231608e..b42d4f1 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,7 +11,7 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action=""): + def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id self.name = name self.description = desc @@ -21,10 +21,10 @@ def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_ self.collection_desc = collection_desc self.action = action self.collection_action = collection_action - self.concepts = {} - self.ml_ready_desc = desc - self.search_terms = [] - self.optional_terms = [] + self.concepts = concepts + self.ml_ready_desc = ml_ready_desc or desc + self.search_terms = search_terms + self.optional_terms = optional_terms def add_concept(self, concept): self.concepts[concept.id] = concept @@ -78,17 +78,17 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id, name, desc, concept_type): - self.id = concept_id + def __init__(self, concept_id, name, desc, concept_type, id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): + self.id = concept_id or id self.name = name - self.description = desc - self.type = concept_type + self.description = desc or description + self.type = concept_type or type self.concept_action = "" - self.identifiers = {} - self.kg_answers = {} - self.search_terms = [] - self.optional_terms = [] - self.ml_ready_desc = desc + self.identifiers = identifiers + self.kg_answers = kg_answers + self.search_terms = search_terms + self.optional_terms = optional_terms + self.ml_ready_desc = desc or ml_ready_desc def add_identifier(self, ident): if ident.id in self.identifiers: From a95bd2e9ba9f9ab014c26bcdad8ac151ec53ee50 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:51:11 -0500 Subject: [PATCH 09/46] probably not a revelation but making everything optional in initialization --- src/dug/core/parsers/_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index b42d4f1..16e88a4 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,11 +11,11 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + def __init__(self, elem_id, name, desc="", elem_type="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id self.name = name self.description = desc - self.type = elem_type + self.type = elem_type or type self.collection_id = collection_id self.collection_name = collection_name self.collection_desc = collection_desc @@ -78,7 +78,7 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id, name, desc, concept_type, id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): + def __init__(self, concept_id="", name="", desc="", concept_type="", id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): self.id = concept_id or id self.name = name self.description = desc or description From f3fca0f8b0d2eccda36985d5d8457b0529bf0340 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:51:29 -0500 Subject: [PATCH 10/46] probably not a revelation but making everything optional in initialization --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 16e88a4..98e5fe9 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,8 +11,8 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id, name, desc="", elem_type="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): - self.id = elem_id + def __init__(self, elem_id="", name="", desc="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + self.id = elem_id or id self.name = name From 1bd901f8ab60672af7dea27bcd1ad88c0232f0e0 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 18:01:07 -0500 Subject: [PATCH 11/46] missed description --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 98e5fe9..da6e726 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,10 +11,10 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id="", name="", desc="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + def __init__(self, elem_id="", name="", desc="", description="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id or id self.name = name - self.description = desc + self.description = desc or description self.type = elem_type or type self.collection_id = collection_id self.collection_name = collection_name From 3f4e3347fe19abdd13ae96d2cfc88d45c75f16ad Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 18:49:55 -0500 Subject: [PATCH 12/46] normalize search text in identifier --- src/dug/core/annotate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index fa566dd..291cf2f 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -25,7 +25,10 @@ def __init__(self, id, label, types=None, search_text=[], description="", equiva if types is None: types = [] self.types = types - self.search_text = [search_text] if search_text else [] + if isinstance(search_text, str): + self.search_text = [search_text] if search_text else [] + elif isinstance(search_text, list): + self.search_text = search_text self.equivalent_identifiers = equivalent_identifiers self.synonyms = synonyms self.purl = purl
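Patch 12 makes Identifier tolerant of both input shapes it can now receive: a raw string coming from annotation, and an already-built list when re-initializing from the serialized JSON that patches 07-11 set up. The coercion, sketched as a standalone function (the final fallback is illustrative, not in the patch):

    def coerce_search_text(search_text):
        # Legacy callers pass a single string; JSON round-trips pass a list.
        if isinstance(search_text, str):
            return [search_text] if search_text else []
        if isinstance(search_text, list):
            return search_text
        return []

    assert coerce_search_text("asthma") == ["asthma"]
    assert coerce_search_text("") == []
    assert coerce_search_text(["asthma", "wheeze"]) == ["asthma", "wheeze"]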
From 227ad4a5bea5460346a0b40e143b832e9842f223 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 20:31:01 -0500 Subject: [PATCH 13/46] https://github.com/TranslatorSRI/NameResolution/issues/129 --- src/dug/core/annotate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 291cf2f..d3066a1 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -428,7 +428,9 @@ def make_request(self, curie: str, http_session: Session): return response.json() except json.decoder.JSONDecodeError as e: logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") - return {curie: {"names": []}} + except requests.exceptions.ConnectionError as e: + logger.error(f'connection reset') + return {curie: {"names": []}} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms From 5888094500914bfd30b2dfaee8f4ad29b30173a1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 09:11:35 -0500 Subject: [PATCH 14/46] avoid deep copy --- src/dug/core/parsers/_base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index da6e726..d1272e7 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -30,10 +30,10 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - copy_dict = copy.deepcopy(self.__dict__) + dict_style = self.__dict__ concepts = {k: v.jsonable() for k, v in self.concepts.items()} - copy_dict['concepts'] = concepts - return copy_dict + dict_style['concepts'] = concepts + return dict_style def get_searchable_dict(self): @@ -137,10 +137,10 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - copy_dict = copy.deepcopy(self.__dict__) identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - copy_dict['identifiers'] = identifiers - return copy_dict + dict_style = self.__dict__ + dict_style['identifiers'] = identifiers + return dict_style def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 527fbb851adf7a4a5555552a4d45f1e03e3a9bcb Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 09:32:12 -0500 Subject: [PATCH 15/46] see if this helps --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index d1272e7..26037a0 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,9 +29,9 @@ def __init__(self, elem_id="", name="", desc="", description="", elem_type="", i def add_concept(self, concept): self.concepts[concept.id] = concept - def jsonable(self): - dict_style = self.__dict__ + def jsonable(self): concepts = {k: v.jsonable() for k, v in self.concepts.items()} + dict_style = self.__dict__ dict_style['concepts'] = concepts return dict_style From 17893a031b8e919ef21fd0946ffb88355b8ee2b5 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 12:37:23 -0500 Subject: [PATCH 16/46] shallow copy and dump --- src/dug/core/parsers/_base.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 26037a0..f8ba301 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,8 +31,13 @@ def add_concept(self, concept): def jsonable(self): concepts = {k: v.jsonable() for k, v in self.concepts.items()} - dict_style = self.__dict__ - dict_style['concepts'] = concepts + dict_style = {} + # make a shallow copy + for k, v in self.__dict__.items(): + if k == 'concepts': + dict_style[k] = concepts + continue + dict_style[k] = v return dict_style @@ -138,8 +143,12 @@ def get_searchable_dict(self): def jsonable(self): identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - dict_style = self.__dict__ - dict_style['identifiers'] = identifiers + dict_style = {} + for k, v in self.__dict__.items(): + if k == 'identifiers': + dict_style[k] = identifiers + continue + dict_style[k] = v return dict_style def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler)
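Patches 14-16 walk back the deepcopy from patch 04: deep-copying the whole instance duplicated every nested object on each serialization, while the patch-14/15 attempt assigned into self.__dict__ directly and so clobbered the live concepts/identifiers maps with plain dicts. Patch 16 settles on a one-level copy that swaps only the nested map, sketched here as a hypothetical helper:

    def jsonable_one_level(obj, nested_attr):
        # Copy one level; recurse only into the named dict-of-objects attribute.
        out = {}
        for key, value in vars(obj).items():
            if key == nested_attr:
                out[key] = {k: v.jsonable() for k, v in value.items()}
            else:
                out[key] = value
        return out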
From 1aa475fc8e83878883158af0519c8e7deef5919d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 17:16:35 -0500 Subject: [PATCH 17/46] logging for crawler --- src/dug/core/crawler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 1b5b877..32d96bb 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -146,6 +146,7 @@ def annotate_element(self, element): http_session=self.http_session) # Each identifier then becomes a concept that links elements together + logger.info("Got %d identifiers for %s", len(identifiers) , element.ml_ready_desc) for identifier in identifiers: if identifier.id not in self.concepts: # Create concept for newly seen identifier From b5405ebf2a3b76a266927254eac5b99e7082b8b3 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 22:38:02 -0500 Subject: [PATCH 18/46] reverting cause of memory leak --- src/dug/core/annotate.py | 15 ++++----- src/dug/core/parsers/_base.py | 59 +++++++++++++---------------------- 2 files changed, 27 insertions(+), 47 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index d3066a1..6294526 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -18,20 +18,17 @@ class Identifier: - def __init__(self, id, label, types=None, search_text=[], description="", equivalent_identifiers=[], synonyms=[], purl = []): + def __init__(self, id, label, types=None, search_text="", description=""): self.id = id self.label = label self.description = description if types is None: types = [] self.types = types - if isinstance(search_text, str): - self.search_text = [search_text] if search_text else [] - elif isinstance(search_text, list): - self.search_text = search_text - self.equivalent_identifiers = equivalent_identifiers - self.synonyms = synonyms - self.purl = purl + self.search_text = [search_text] if search_text else [] + self.equivalent_identifiers = [] + self.synonyms = [] + self.purl = "" @property def id_type(self): @@ -620,4 +617,4 @@ def get_curie_purl(curie): if __name__ == "__main__": import doctest - doctest.testmod() + doctest.testmod() \ No newline at end of file diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index f8ba301..43a1801 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -1,6 +1,5 @@ import json from typing import Union, Callable, Any, Iterable -import copy from dug.core.loaders import InputFile @@ -11,35 +10,26 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id="", name="", desc="", description="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action=""): self.id = elem_id self.name = name self.description = desc self.type = elem_type self.collection_id = collection_id self.collection_name = collection_name self.collection_desc = collection_desc self.action = action self.collection_action = collection_action - self.concepts = concepts - self.ml_ready_desc = ml_ready_desc or desc - self.search_terms = search_terms - self.optional_terms = optional_terms + self.concepts = {} + self.ml_ready_desc = desc + self.search_terms = [] + self.optional_terms = [] def add_concept(self, concept): self.concepts[concept.id] = concept - def jsonable(self): concepts = {k: v.jsonable() for k, v in self.concepts.items()} - dict_style = {} - # make a shallow copy - for k, v in self.__dict__.items(): - if k == 'concepts': - dict_style[k] = concepts - continue - dict_style[k] = v - return dict_style - + def jsonable(self): + return self.__dict__ def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -83,17 +73,17 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id="", name="", desc="", concept_type="", id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): + def __init__(self, concept_id, name, desc, concept_type): self.id = concept_id self.name = name - self.description = desc or description - self.type = concept_type or type + self.description = desc + self.type = concept_type self.concept_action = "" - self.identifiers = identifiers - self.kg_answers = kg_answers - self.search_terms = search_terms - self.optional_terms = optional_terms - self.ml_ready_desc = desc or ml_ready_desc + self.identifiers = {} + self.kg_answers = {} + self.search_terms = [] + self.optional_terms = [] + self.ml_ready_desc = desc def add_identifier(self, ident): if ident.id in self.identifiers: @@ -142,14 +132,7 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - dict_style = {} - for k, v in self.__dict__.items(): - if k == 'identifiers': - dict_style[k] = identifiers - continue - dict_style[k] = v - return dict_style + return self.__dict__ def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -159,4 +142,4 @@ def __str__(self): Parser = Callable[[Any], Iterable[Indexable]] -FileParser = Callable[[InputFile], Iterable[Indexable]] +FileParser = Callable[[InputFile], Iterable[Indexable]] \ No newline at end of file From f1950e0bd63168c5b55aeedf52be338385d80f4d Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 21 Dec 2023 18:37:24 -0500 Subject: [PATCH 19/46] debug message for tranql --- src/dug/core/annotate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 6294526..e605292 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -135,7 +135,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ response = json.load(stream) else: query = query_factory.get_query(identifier) - logger.debug(query) + logger.info(query) response = requests.post( url=self.url, headers=self.tranql_headers, data=query).json() @@ -149,6 +149,9 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ return [] except KeyError as e: logger.error(f"Could not find key: {e} in response: {response}") + except Exception as ex: + logger.error(response) + raise ex # Dump out to file if there's a knowledge graph with open(kg_filename, 'w') as stream: @@ -617,4 +620,4 @@ def get_curie_purl(curie): if __name__ == "__main__": import doctest - doctest.testmod() \ No newline at end of file + doctest.testmod() From 9ae8e0aa945301df2718e65e8e959199c665b1d8 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 4 Jan 2024 10:05:17 -0500 Subject: [PATCH 20/46] remove annotate commented out code, back down python min requirement --- setup.cfg | 2 +- src/dug/core/annotate.py | 612 --------------------------------------- 2 files changed, 1 insertion(+), 613 deletions(-) delete mode 100644 src/dug/core/annotate.py diff --git a/setup.cfg b/setup.cfg index b551ef3..75fe4d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.12 +python_requires = >=3.11 include_package_data = true install_requires = elasticsearch==8.5.2 diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py deleted file mode 100644 index 29fa85f..0000000 --- a/src/dug/core/annotate.py +++ /dev/null @@ -1,612 +0,0 @@ -# import json -# import logging -# import os -# import re -# import urllib.parse -# from typing import TypeVar, Generic, Union, List, Tuple, Optional -# import bmt -# import requests -# from requests import Session - -# import dug.core.tranql as tql - - -# logger = logging.getLogger('dug') - -# logging.getLogger("requests").setLevel(logging.WARNING) -# logging.getLogger("urllib3").setLevel(logging.WARNING) - - -# class Identifier: -# def __init__(self, id, label, types=None, search_text="", description=""): -# self.id = id -# self.label = label -# self.description = description -# if types is None: -# types = [] -# self.types = types -# self.search_text = [search_text] if search_text else [] -# self.equivalent_identifiers = [] -# self.synonyms = [] -# self.purl = "" - -# @property -# def id_type(self): -# return self.id.split(":")[0] - -# def add_search_text(self, text): -# # Add text only if it's unique and if not empty string -# if text and text not in self.search_text: -# self.search_text.append(text) - -# def get_searchable_dict(self): -# # Return a version of the identifier compatible with what's in ElasticSearch -# es_ident = { -# 'id': self.id, -# 'label': self.label, -# 'equivalent_identifiers': self.equivalent_identifiers, -# 'type': self.types, -# 'synonyms': self.synonyms -# } -# return es_ident - -# def jsonable(self): -# return self.__dict__ - - -# class DugAnnotator: -# def __init__( -# self, -# preprocessor: "Preprocessor", -# annotator: "Annotator", -# normalizer: "Normalizer", -# synonym_finder: "SynonymFinder", -# ontology_greenlist=[], -# ): -# self.preprocessor = preprocessor -# self.annotator = annotator -# self.normalizer = normalizer -# self.synonym_finder = synonym_finder -#
self.ontology_greenlist = ontology_greenlist -# self.norm_fails_file = "norm_fails.txt" -# self.anno_fails_file = "anno_fails.txt" - -# def annotate(self, text, http_session): - -# # Preprocess text (debraviate, remove stopwords, etc.) -# text = self.preprocessor.preprocess(text) - -# # Fetch identifiers -# raw_identifiers = self.annotator.annotate(text, http_session) - -# # Write out to file if text fails to annotate -# if not raw_identifiers: -# with open(self.anno_fails_file, "a") as fh: -# fh.write(f'{text}\n') - -# processed_identifiers = [] -# for identifier in raw_identifiers: - -# # Normalize identifier using normalization service -# norm_id = self.normalizer.normalize(identifier, http_session) - -# # Skip adding id if it doesn't normalize -# if norm_id is None: -# # Write out to file if identifier doesn't normalize -# with open(self.norm_fails_file, "a") as fh: -# fh.write(f'{identifier.id}\n') - -# # Discard non-normalized ident if not in greenlist -# if identifier.id_type not in self.ontology_greenlist: -# continue - -# # If it is in greenlist just keep moving forward -# norm_id = identifier - -# # Add synonyms to identifier -# norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) - -# # Get pURL for ontology identifer for more info -# norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) -# processed_identifiers.append(norm_id) - -# return processed_identifiers - - -# class ConceptExpander: -# def __init__(self, url, min_tranql_score=0.2): -# self.url = url -# self.min_tranql_score = min_tranql_score -# self.include_node_keys = ["id", "name", "synonyms"] -# self.include_edge_keys = [] -# self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} - -# def is_acceptable_answer(self, answer): -# return True - -# def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): - -# answer_kgs = [] - -# # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers -# if os.path.exists(kg_filename): -# logger.info(f"identifier {identifier} is already crawled. 
Skipping TranQL query.") -# with open(kg_filename, 'r') as stream: -# response = json.load(stream) -# else: -# query = query_factory.get_query(identifier) -# logger.debug(query) -# response = requests.post( -# url=self.url, -# headers=self.tranql_headers, -# data=query).json() - -# # Case: Skip if empty KG -# try: -# if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: -# logger.debug(f"Did not find a knowledge graph for {query}") -# logger.debug(f"{self.url} returned response: {response}") -# return [] -# except KeyError as e: -# logger.error(f"Could not find key: {e} in response: {response}") - -# # Dump out to file if there's a knowledge graph -# with open(kg_filename, 'w') as stream: -# json.dump(response, stream, indent=2) - -# # Get nodes in knowledge graph hashed by ids for easy lookup -# noMessage = (len(response.get("message",{})) == 0) -# statusError = (response.get("status","") == 'Error') -# if noMessage or statusError: -# # Skip on error -# logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") -# return [] -# kg = tql.QueryKG(response) - -# for answer in kg.answers: -# # Filter out answers that don't meet some criteria -# # Right now just don't filter anything -# logger.debug(f"Answer: {answer}") -# if not self.is_acceptable_answer(answer): -# logger.warning("Skipping answer as it failed one or more acceptance criteria. See log for details.") -# continue - -# # Get subgraph containing only information for this answer -# try: -# # Temporarily surround in try/except because sometimes the answer graphs -# # contain invalid references to edges/nodes -# # This will be fixed in Robokop but for now just silently warn if answer is invalid -# node_attributes_filter = None if include_all_attributes else self.include_node_keys -# edge_attributes_filter = None if include_all_attributes else self.include_edge_keys -# answer_kg = kg.get_answer_subgraph(answer, -# include_node_keys=node_attributes_filter, -# include_edge_keys=edge_attributes_filter) - -# # Add subgraph to list of acceptable answers to query -# answer_kgs.append(answer_kg) - -# except tql.MissingNodeReferenceError: -# # TEMPORARY: Skip answers that have invalid node references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of non-preferred id! " -# "See err msg for details.") -# continue -# except tql.MissingEdgeReferenceError: -# # TEMPORARY: Skip answers that have invalid edge references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of invalid edge reference! 
" -# "See err msg for details.") -# continue - -# return answer_kgs - - -# class Preprocessor: -# """"Class for preprocessing strings so they are better interpreted by NLP steps""" - -# def __init__(self, debreviator=None, stopwords=None): -# if debreviator is None: -# debreviator = self.default_debreviator_factory() -# self.decoder = debreviator - -# if stopwords is None: -# stopwords = [] -# self.stopwords = stopwords - -# def preprocess(self, text: str) -> str: -# """ -# Apply debreviator to replace abbreviations and other characters - -# >>> pp = Preprocessor({"foo": "bar"}, ["baz"]) -# >>> pp.preprocess("Hello foo") -# 'Hello bar' - -# >>> pp.preprocess("Hello baz world") -# 'Hello world' -# """ - -# for key, value in self.decoder.items(): -# text = text.replace(key, value) - -# # Remove any stopwords -# text = " ".join([word for word in text.split() if word not in self.stopwords]) -# return text - -# @staticmethod -# def default_debreviator_factory(): -# return {"bmi": "body mass index", "_": " "} - - -# Input = TypeVar("Input") -# Output = TypeVar("Output") - - -# class ApiClient(Generic[Input, Output]): - -# def make_request(self, value: Input, http_session: Session): -# raise NotImplementedError() - -# def handle_response(self, value, response: Union[dict, list]) -> Output: -# raise NotImplementedError() - -# def __call__(self, value: Input, http_session: Session) -> Output: -# response = self.make_request(value, http_session) - -# result = self.handle_response(value, response) - -# return result - - -# class Annotator(ApiClient[str, List[Identifier]]): -# """ -# Use monarch API service to fetch ontology IDs found in text -# """ - -# def __init__(self, url: str): -# self.url = url - -# def sliding_window(self, text, max_characters=2000, padding_words=5): -# """ -# For long texts sliding window works as the following -# "aaaa bbb ccc ddd eeee" -# with a sliding max chars 8 and padding 1 -# first yeild would be "aaaa bbb" -# next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" -# allowing context to be preserved with the scope of padding -# For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
-# """ -# words = text.split(' ') -# total_words = len(words) -# window_end = False -# current_index = 0 -# while not window_end: -# current_string = "" -# for index, word in enumerate(words[current_index: ]): -# if len(current_string) + len(word) + 1 >= max_characters: -# yield current_string + " " -# current_index += index - padding_words -# break -# appendee = word if index == 0 else " " + word -# current_string += appendee - -# if current_index + index == len(words) - 1: -# window_end = True -# yield current_string - -# def annotate(self, text, http_session): -# logger.debug(f"Annotating: {text}") -# identifiers = [] -# for chunk_text in self.sliding_window(text): -# identifiers += self(chunk_text, http_session) -# return identifiers - -# def make_request(self, value: Input, http_session: Session): -# value = urllib.parse.quote(value) -# url = f'{self.url}{value}' - -# # This could be moved to a config file -# NUM_TRIES = 5 -# for _ in range(NUM_TRIES): -# response = http_session.get(url) -# if response is not None: -# # looks like it worked -# break - -# # if the reponse is still None here, throw an error -# if response is None: -# raise RuntimeError(f"no response from {url}") -# return response.json() - -# def handle_response(self, value, response: dict) -> List[Identifier]: -# identifiers = [] -# """ Parse each identifier and initialize identifier object """ -# for span in response.get('spans', []): -# search_text = span.get('text', None) -# for token in span.get('token', []): -# curie = token.get('id', None) -# if not curie: -# continue - -# biolink_types = token.get('category') -# label = token.get('terms')[0] -# identifiers.append(Identifier(id=curie, -# label=label, -# types=biolink_types, -# search_text=search_text)) -# return identifiers - - -# class Normalizer(ApiClient[Identifier, Identifier]): -# def __init__(self, url): -# self.bl_toolkit = bmt.Toolkit() -# self.url = url - -# def normalize(self, identifier: Identifier, http_session: Session): -# # Use RENCI's normalization API service to get the preferred version of an identifier -# logger.debug(f"Normalizing: {identifier.id}") -# return self(identifier, http_session) - -# def make_request(self, value: Identifier, http_session: Session) -> dict: -# curie = value.id -# url = f"{self.url}{urllib.parse.quote(curie)}" -# try: -# response = http_session.get(url) -# except Exception as get_exc: -# logger.info(f"Error normalizing {value} at {url}") -# logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") -# return {} -# try: -# normalized = response.json() -# except Exception as json_exc: -# logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") -# logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") -# return {} - -# return normalized - -# def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[Identifier]: -# """ Record normalized results. """ -# curie = identifier.id -# normalization = normalized.get(curie, {}) -# if normalization is None: -# logger.info(f"Normalization service did not return normalization for: {curie}") -# return None - -# preferred_id = normalization.get("id", {}) -# equivalent_identifiers = normalization.get("equivalent_identifiers", []) -# biolink_type = normalization.get("type", []) - -# # Return none if there isn't actually a preferred id -# if 'identifier' not in preferred_id: -# logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") -# return None - -# logger.debug(f"Preferred id: {preferred_id}") -# identifier.id = preferred_id.get('identifier', '') -# identifier.label = preferred_id.get('label', '') -# identifier.description = preferred_id.get('description', '') -# identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] -# try: -# identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name -# except: -# # converts biolink:SmallMolecule to small molecule -# identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() -# return identifier - - -# class SynonymFinder(ApiClient[str, List[str]]): - -# def __init__(self, url: str): -# self.url = url - -# def get_synonyms(self, curie: str, http_session): -# ''' -# This function uses the NCATS translator service to return a list of synonyms for -# curie id -# ''' - -# return self(curie, http_session) - -# def make_request(self, curie: str, http_session: Session): -# # Get response from namelookup reverse lookup op -# # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) -# url = f"{self.url}" -# payload = { -# 'curies': [curie] -# } -# try: -# response = http_session.post(url, json=payload) -# if str(response.status_code).startswith('4'): -# logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") -# return {curie: []} -# if str(response.status_code).startswith('5'): -# logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") -# return {curie: []} -# return response.json() -# except json.decoder.JSONDecodeError as e: -# logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") -# return {curie: []} - -# def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: -# # Return curie synonyms -# return raw_synonyms.get(curie, []) - - - - - -# class BioLinkPURLerizer: -# # Static class for the sole purpose of doing lookups of different ontology PURLs -# # Is it pretty? No. But it gets the job done. 
-# biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", -# "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", -# "BIOGRID": "http://identifiers.org/biogrid/", -# "BIOSAMPLE": "http://identifiers.org/biosample/", -# "BSPO": "http://purl.obolibrary.org/obo/BSPO_", -# "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", -# "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", -# "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", -# "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", -# "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", -# "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", -# "CL": "http://purl.obolibrary.org/obo/CL_", -# "CLINVAR": "http://identifiers.org/clinvar/", -# "CLO": "http://purl.obolibrary.org/obo/CLO_", -# "COAR_RESOURCE": "http://purl.org/coar/resource_type/", -# "CPT": "https://www.ama-assn.org/practice-management/cpt/", -# "CTD": "http://translator.ncats.nih.gov/CTD_", -# "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", -# "DBSNP": "http://identifiers.org/dbsnp/", -# "DGIdb": "https://www.dgidb.org/interaction_types", -# "DOID": "http://purl.obolibrary.org/obo/DOID_", -# "DRUGBANK": "http://identifiers.org/drugbank/", -# "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", -# "EC": "http://www.enzyme-database.org/query.php?ec=", -# "ECTO": "http://purl.obolibrary.org/obo/ECTO_", -# "EDAM-DATA": "http://edamontology.org/data_", -# "EDAM-FORMAT": "http://edamontology.org/format_", -# "EDAM-OPERATION": "http://edamontology.org/operation_", -# "EDAM-TOPIC": "http://edamontology.org/topic_", -# "EFO": "http://identifiers.org/efo/", -# "ENSEMBL": "http://identifiers.org/ensembl/", -# "ExO": "http://purl.obolibrary.org/obo/ExO_", -# "FAO": "http://purl.obolibrary.org/obo/FAO_", -# "FB": "http://identifiers.org/fb/", -# "FBcv": "http://purl.obolibrary.org/obo/FBcv_", -# "FlyBase": "http://flybase.org/reports/", -# "GAMMA": "http://translator.renci.org/GAMMA_", -# "GO": "http://purl.obolibrary.org/obo/GO_", -# "GOLD.META": "http://identifiers.org/gold.meta/", -# "GOP": "http://purl.obolibrary.org/obo/go#", -# "GOREL": "http://purl.obolibrary.org/obo/GOREL_", -# "GSID": "https://scholar.google.com/citations?user=", -# "GTEx": "https://www.gtexportal.org/home/gene/", -# "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", -# "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", -# "HGNC": "http://identifiers.org/hgnc/", -# "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", -# "HMDB": "http://identifiers.org/hmdb/", -# "HP": "http://purl.obolibrary.org/obo/HP_", -# "ICD0": "http://translator.ncats.nih.gov/ICD0_", -# "ICD10": "http://translator.ncats.nih.gov/ICD10_", -# "ICD9": "http://translator.ncats.nih.gov/ICD9_", -# "INCHI": "http://identifiers.org/inchi/", -# "INCHIKEY": "http://identifiers.org/inchikey/", -# "INTACT": "http://identifiers.org/intact/", -# "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", -# "KEGG": "http://identifiers.org/kegg/", -# "LOINC": "http://loinc.org/rdf/", -# "MEDDRA": "http://identifiers.org/meddra/", -# "MESH": "http://identifiers.org/mesh/", -# "MGI": "http://identifiers.org/mgi/", -# "MI": "http://purl.obolibrary.org/obo/MI_", -# "MIR": "http://identifiers.org/mir/", -# "MONDO": "http://purl.obolibrary.org/obo/MONDO_", -# "MP": "http://purl.obolibrary.org/obo/MP_", -# "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", -# "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", -# "NCBIGENE": 
"http://identifiers.org/ncbigene/", -# "NCBITaxon": "http://purl.obolibrary.org/obo/NCBITaxon_", -# "NCIT": "http://purl.obolibrary.org/obo/NCIT_", -# "NDDF": "http://purl.bioontology.org/ontology/NDDF/", -# "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", -# "OBAN": "http://purl.org/oban/", -# "OBOREL": "http://purl.obolibrary.org/obo/RO_", -# "OIO": "http://www.geneontology.org/formats/oboInOwl#", -# "OMIM": "http://purl.obolibrary.org/obo/OMIM_", -# "ORCID": "https://orcid.org/", -# "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", -# "ORPHANET": "http://identifiers.org/orphanet/", -# "PANTHER.FAMILY": "http://identifiers.org/panther.family/", -# "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", -# "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", -# "PDQ": "https://www.cancer.gov/publications/pdq#", -# "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", -# "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", -# "PHAROS": "http://pharos.nih.gov", -# "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", -# "PO": "http://purl.obolibrary.org/obo/PO_", -# "POMBASE": "http://identifiers.org/pombase/", -# "PR": "http://purl.obolibrary.org/obo/PR_", -# "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", -# "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", -# "PathWhiz": "http://smpdb.ca/pathways/#", -# "REACT": "http://www.reactome.org/PathwayBrowser/#/", -# "REPODB": "http://apps.chiragjpgroup.org/repoDB/", -# "RGD": "http://identifiers.org/rgd/", -# "RHEA": "http://identifiers.org/rhea/", -# "RNACENTRAL": "http://identifiers.org/rnacentral/", -# "RO": "http://purl.obolibrary.org/obo/RO_", -# "RTXKG1": "http://kg1endpoint.rtx.ai/", -# "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", -# "ResearchID": "https://publons.com/researcher/", -# "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", -# "SGD": "http://identifiers.org/sgd/", -# "SIO": "http://semanticscience.org/resource/SIO_", -# "SMPDB": "http://identifiers.org/smpdb/", -# "SNOMEDCT": "http://identifiers.org/snomedct/", -# "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", -# "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", -# "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", -# "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", -# "UBERON": "http://purl.obolibrary.org/obo/UBERON_", -# "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", -# "UMLS": "http://identifiers.org/umls/", -# "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", -# "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", -# "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", -# "UNII": "http://identifiers.org/unii/", -# "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", -# "UniProtKB": "http://identifiers.org/uniprot/", -# "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", -# "VMC": "https://github.com/ga4gh/vr-spec/", -# "WB": "http://identifiers.org/wb/", -# "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", -# "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", -# "WIKIDATA": "https://www.wikidata.org/wiki/", -# "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", -# "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", -# "WormBase": "https://www.wormbase.org/get?name=", -# "ZFIN": "http://identifiers.org/zfin/", -# "ZP": "http://purl.obolibrary.org/obo/ZP_", -# "alliancegenome": "https://www.alliancegenome.org/", -# 
"biolink": "https://w3id.org/biolink/vocab/", -# "biolinkml": "https://w3id.org/biolink/biolinkml/", -# "chembio": "http://translator.ncats.nih.gov/chembio_", -# "dcterms": "http://purl.org/dc/terms/", -# "dictyBase": "http://dictybase.org/gene/", -# "doi": "https://doi.org/", -# "fabio": "http://purl.org/spar/fabio/", -# "foaf": "http://xmlns.com/foaf/0.1/", -# "foodb.compound": "http://foodb.ca/compounds/", -# "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", -# "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", -# "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", -# "hetio": "http://translator.ncats.nih.gov/hetio_", -# "interpro": "https://www.ebi.ac.uk/interpro/entry/", -# "isbn": "https://www.isbn-international.org/identifier/", -# "isni": "https://isni.org/isni/", -# "issn": "https://portal.issn.org/resource/ISSN/", -# "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", -# "oboformat": "http://www.geneontology.org/formats/oboInOWL#", -# "pav": "http://purl.org/pav/", -# "prov": "http://www.w3.org/ns/prov#", -# "qud": "http://qudt.org/1.1/schema/qudt#", -# "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", -# "rdfs": "http://www.w3.org/2000/01/rdf-schema#", -# "skos": "https://www.w3.org/TR/skos-reference/#", -# "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", -# "xsd": "http://www.w3.org/2001/XMLSchema#", -# "@vocab": "https://w3id.org/biolink/vocab/"} - -# @staticmethod -# def get_curie_purl(curie): -# # Split into prefix and suffix -# suffix = curie.split(":")[1] -# prefix = curie.split(":")[0] - -# # Check to see if the prefix exists in the hash -# if prefix not in BioLinkPURLerizer.biolink_lookup: -# return None - -# return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" From a18670e3e463ae43272b92589ce9f8a8a244b68c Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 4 Jan 2024 12:28:17 -0500 Subject: [PATCH 21/46] feat: updated elasticsearch auth protocol to latest version --- setup.cfg | 2 +- src/dug/core/async_search.py | 4 ++-- src/dug/core/index.py | 4 ++-- tests/integration/conftest.py | 16 ++++++++-------- tests/integration/test_index.py | 2 +- tests/unit/conftest.py | 11 +++++------ tests/unit/test_api.py | 13 +++++++------ tests/unit/test_core/test_search.py | 3 ++- 8 files changed, 28 insertions(+), 27 deletions(-) diff --git a/setup.cfg b/setup.cfg index 75fe4d2..0df3d5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.11 +python_requires = >=3.10 include_package_data = true install_requires = elasticsearch==8.5.2 diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 44d7c98..b39e6a9 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -50,12 +50,12 @@ def __init__(self, cfg: Config, indices=None): cafile=self._cfg.elastic_ca_path ) self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) async def dump_concepts(self, index, query={}, size=None, diff --git a/src/dug/core/index.py b/src/dug/core/index.py index 93a2d58..0491d06 100644 --- a/src/dug/core/index.py +++ b/src/dug/core/index.py @@ -30,12 +30,12 @@ def __init__(self, cfg: Config, indices=None): ) 
self.es = Elasticsearch( hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password), + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = Elasticsearch( hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) self.replicas = self.get_es_node_count() if self.es.ping(): diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7bc0bcf..b671e3f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Dict -import pytest +import pytest_asyncio TEST_DATA_DIR = Path(__file__).parent.resolve() / "data" @@ -45,7 +45,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def monarch_annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -94,7 +94,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def token_classifier_api(): return MockApiService( urls={ @@ -118,7 +118,7 @@ def token_classifier_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def sapbert_annotator_api(): return MockApiService( urls={ @@ -145,7 +145,7 @@ def sapbert_annotator_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -188,7 +188,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def null_normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -211,7 +211,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -234,7 +234,7 @@ def synonym_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def null_synonym_api(): return MockApiService( urls={"http://synonyms.api": [json.dumps({"XAO:0000336": {"names":[]}}), 200]} diff --git a/tests/integration/test_index.py b/tests/integration/test_index.py index 31d0d3d..829e4ba 100644 --- a/tests/integration/test_index.py +++ b/tests/integration/test_index.py @@ -21,7 +21,7 @@ def is_elastic_up(): try: es = Elasticsearch( hosts=hosts, - http_auth=(username, password) + basic_auth=(username, password) ) return es.ping() except Exception: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f40d4f6..87f2edc 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -3,8 +3,7 @@ from dataclasses import dataclass from typing import Dict -import pytest - +import pytest_asyncio @dataclass class MockResponse: @@ -41,7 +40,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -150,7 +149,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -193,7 +192,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -216,7 +215,7 @@ def synonym_api(): ) -@pytest.fixture() +@pytest_asyncio.fixture() def ontology_api(): base_url = "http://ontology.api/?curie={curie}" diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py index e55b688..cd35ba3 100644 --- a/tests/unit/test_api.py +++ b/tests/unit/test_api.py @@ -6,24 +6,25 @@ import pytest pytest.skip("skipping as dug.api is no 
longer present", allow_module_level=True) from pytest import mark +import pytest_asyncio from dug.api import app, main, DugResource -@pytest.fixture +@pytest_asyncio.fixture def dug_api_test_client(): with app.test_client() as client: yield client -@pytest.fixture +@pytest_asyncio.fixture def mock_g_object(): with patch('dug.api.dug') as g: yield g -@pytest.fixture +@pytest_asyncio.fixture def mock_search_concepts(mock_g_object): mock_g_object().search_concepts.return_value = {'hits': {'hits': [ {'_type': '_doc', @@ -38,21 +39,21 @@ def mock_search_concepts(mock_g_object): }} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_kg(mock_g_object): mock_g_object().search_kg.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_variables(mock_g_object): mock_g_object().search_variables.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_agg_data_types(mock_g_object): mock_g_object().agg_data_type.return_value = ["DBGaP"] diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index b7edc83..db7ed75 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -3,6 +3,7 @@ from unittest.mock import patch import pytest +import pytest_asyncio from dug.core.index import Index, SearchException from dug.config import Config @@ -95,7 +96,7 @@ def search(self, index, body, **kwargs): return {"results": {k: v for k, v in values.items() if body in v}} -@pytest.fixture +@pytest_asyncio.fixture def elastic(): with patch("dug.core.index.Elasticsearch") as es_class: es_instance = MockElastic(indices=MockIndices()) From 4c4977d83609faee2fbc8c505d23025d23c49352 Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 4 Jan 2024 13:05:57 -0500 Subject: [PATCH 22/46] feat: change annotator config to allow for different configs --- src/dug/config.py | 137 +++++++++++-------- src/dug/core/annotators/__init__.py | 12 +- src/dug/core/annotators/_base.py | 1 - src/dug/core/annotators/monarch_annotator.py | 4 +- src/dug/core/annotators/sapbert_annotator.py | 5 +- 5 files changed, 95 insertions(+), 64 deletions(-) diff --git a/src/dug/config.py b/src/dug/config.py index 5f4d59d..5f49e9e 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -9,8 +9,9 @@ @dataclass class Config: """ - TODO: Populate description + TODO: Populate description """ + elastic_password: str = "changeme" redis_password: str = "changeme" @@ -27,74 +28,102 @@ class Config: nboost_port: int = 8000 # Preprocessor config that will be passed to annotate.Preprocessor constructor - preprocessor: dict = field(default_factory=lambda: { - "debreviator": { - "BMI": "body mass index" - }, - "stopwords": ["the"] - }) - + preprocessor: dict = field( + default_factory=lambda: { + "debreviator": {"BMI": "body mass index"}, + "stopwords": ["the"], + } + ) + annotator_type: str = "annotator-monarch" # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" - }) + annotator_args: dict = field( + default_factory=lambda: { + "annotator-monarch": { + "url": 
"https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "annotator-sapbert": { + "classificationUrl": "https://med-nemo.apps.renci.org/annotate/", + "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor - normalizer: dict = field(default_factory=lambda: { - "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" - }) + normalizer: dict = field( + default_factory=lambda: { + "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" + } + ) # Synonym service config that will be passed to annotate.SynonymHelper constructor - synonym_service: dict = field(default_factory=lambda: { - "url": "https://name-resolution-sri.renci.org/reverse_lookup" - }) + synonym_service: dict = field( + default_factory=lambda: { + "url": "https://name-resolution-sri.renci.org/reverse_lookup" + } + ) # Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor - ontology_helper: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/bioentity/" - }) + ontology_helper: dict = field( + default_factory=lambda: { + "url": "https://api.monarchinitiative.org/api/bioentity/" + } + ) # Redlist of identifiers not to expand via TranQL tranql_exclude_identifiers: list = field(default_factory=lambda: ["CHEBI:17336"]) - tranql_queries: dict = field(default_factory=lambda: { - "disease": ["disease", "phenotypic_feature"], - "pheno": ["phenotypic_feature", "disease"], - "anat": ["disease", "anatomical_entity"], - "chem_to_disease": ["chemical_entity", "disease"], - "small_molecule_to_disease": ["small_molecule", "disease"], - "chemical_mixture_to_disease": ["chemical_mixture", "disease"], - "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], - }) - - node_to_element_queries: dict = field(default_factory=lambda: { - # Dug element type to cast the query kg nodes to - "cde": { - # Parse nodes matching criteria in kg - "node_type": "biolink:Publication", - "curie_prefix": "HEALCDE", - # list of attributes that are lists to be casted to strings - "list_field_choose_first": [ - "files" - ], - "attribute_mapping": { - # "DugElement Attribute" : "KG Node attribute" - "name": "name", - "desc": "summary", - "collection_name": "cde_category", - "collection_id": "cde_category", - "action": "files" + tranql_queries: dict = field( + default_factory=lambda: { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"], + "anat": ["disease", "anatomical_entity"], + "chem_to_disease": ["chemical_entity", "disease"], + "small_molecule_to_disease": ["small_molecule", "disease"], + "chemical_mixture_to_disease": ["chemical_mixture", "disease"], + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], + } + ) + + node_to_element_queries: dict = field( + default_factory=lambda: { + # Dug element type to cast the query kg nodes to + "cde": { + # Parse nodes matching criteria in kg + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + # list of attributes that are lists to be casted to strings + "list_field_choose_first": ["files"], + "attribute_mapping": { + # "DugElement Attribute" : "KG Node attribute" + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + 
"collection_id": "cde_category", + "action": "files", + }, } } - }) + ) - concept_expander: dict = field(default_factory=lambda: { - "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", - "min_tranql_score": 0.0 - }) + concept_expander: dict = field( + default_factory=lambda: { + "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", + "min_tranql_score": 0.0, + } + ) # List of ontology types that can be used even if they fail normalization - ontology_greenlist: list = field(default_factory=lambda: ["PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"]) + ontology_greenlist: list = field( + default_factory=lambda: [ + "PATO", + "CHEBI", + "MONDO", + "UBERON", + "HP", + "MESH", + "UMLS", + ] + ) @classmethod def from_env(cls): @@ -107,7 +136,7 @@ def from_env(cls): "elastic_password": "ELASTIC_PASSWORD", "redis_host": "REDIS_HOST", "redis_port": "REDIS_PORT", - "redis_password": "REDIS_PASSWORD" + "redis_password": "REDIS_PASSWORD", } kwargs = {} diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 1a58c40..903825b 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -14,8 +14,8 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_monarch_annotator() - annotator_dict["annotator-sapbert"] = build_sapbert_annotator() + annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch") + annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert") class AnnotatorNotFoundException(Exception): @@ -29,6 +29,7 @@ def get_annotator(hook, annotator_name) -> Annotator: hook.define_annotators(annotator_dict=available_annotators) annotator = available_annotators.get(annotator_name.lower()) if annotator is not None: + logger.info(f'Annotating with {annotator}') return annotator err_msg = f"Cannot find annotator of type '{annotator_name}'\n" \ @@ -36,21 +37,22 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(): +def build_monarch_annotator(annotate_type): config = Config.from_env() annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), config=config, + **config.annotator_args[annotate_type] ) - return annotator -def build_sapbert_annotator(): +def build_sapbert_annotator(annotate_type): config = Config.from_env() annotator = AnnotateSapbert( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), + **config.annotator_args[annotate_type] ) return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index c725bff..ea30b4d 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -13,7 +13,6 @@ logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) - class DugIdentifier: """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data. \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future. 
diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py
index 1c67f40..e50e317 100644
--- a/src/dug/core/annotators/monarch_annotator.py
+++ b/src/dug/core/annotators/monarch_annotator.py
@@ -21,9 +21,10 @@ def __init__(
         synonym_finder,
         config,
         ontology_greenlist=[],
+        **kwargs
     ):
-        self.annotatorUrl = config.annotator['url']
+        self.annotatorUrl = kwargs['url']
         self.normalizer = normalizer
         self.synonym_finder = synonym_finder
         self.ontology_greenlist = ontology_greenlist
@@ -42,7 +43,6 @@ def __init__(
         self.stopwords = stopwords

     def __call__(self, text, http_session) -> List[DugIdentifier]:
-
         # Preprocess text (debraviate, remove stopwords, etc.)
         text = self.preprocess_text(text)

diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py
index 7c2fa81..a677140 100644
--- a/src/dug/core/annotators/sapbert_annotator.py
+++ b/src/dug/core/annotators/sapbert_annotator.py
@@ -22,9 +22,10 @@ def __init__(
         normalizer,
         synonym_finder,
         ontology_greenlist=[],
+        **kwargs
     ):
-        self.classificationUrl = "https://med-nemo.apps.renci.org/annotate/"
-        self.annotatorUrl = "https://babel-sapbert.apps.renci.org/annotate/"
+        self.classificationUrl = kwargs['classificationUrl']
+        self.annotatorUrl = kwargs['annotatorUrl']
         self.normalizer = normalizer
         self.synonym_finder = synonym_finder
         self.ontology_greenlist = ontology_greenlist

From 4eb6d2e919bdc903cc9b7d3b585020bd52d8fd7b Mon Sep 17 00:00:00 2001
From: YaphetKG
Date: Thu, 4 Jan 2024 15:02:31 -0500
Subject: [PATCH 23/46] pass down config, no global access

---
 src/dug/core/__init__.py | 2 +-
 src/dug/core/annotators/__init__.py | 18 +++++++++---------
 src/dug/hookspecs.py | 3 ++-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py
index 9fca7ce..effcb7b 100644
--- a/src/dug/core/__init__.py
+++ b/src/dug/core/__init__.py
@@ -63,7 +63,7 @@ def crawl(self, target_name: str, parser_type: str, annotator_type: str, element
         pm = get_plugin_manager()
         parser = get_parser(pm.hook, parser_type)
-        annotator = get_annotator(pm.hook, annotator_type)
+        annotator = get_annotator(pm.hook, annotator_type, self._factory.config)
         targets = get_targets(target_name)

         for target in targets:
diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py
index 903825b..acb7823 100644
--- a/src/dug/core/annotators/__init__.py
+++ b/src/dug/core/annotators/__init__.py
@@ -13,20 +13,20 @@
 hookimpl = pluggy.HookimplMarker("dug")

 @hookimpl
-def define_annotators(annotator_dict: Dict[str, Annotator]):
-    annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch")
-    annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert")
+def define_annotators(annotator_dict: Dict[str, Annotator], config: Config):
+    annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch", config=config)
+    annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert", config=config)

 class AnnotatorNotFoundException(Exception):
     ...
-def get_annotator(hook, annotator_name) -> Annotator: +def get_annotator(hook, annotator_name, config: Config) -> Annotator: """Get the annotator from all annotators registered via the define_annotators hook""" available_annotators = {} - hook.define_annotators(annotator_dict=available_annotators) + hook.define_annotators(annotator_dict=available_annotators, config=config) annotator = available_annotators.get(annotator_name.lower()) if annotator is not None: logger.info(f'Annotating with {annotator}') @@ -37,8 +37,8 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(annotate_type): - config = Config.from_env() +def build_monarch_annotator(annotate_type: str, config: Config): + logger.info(f"Building Monarch annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), @@ -47,8 +47,8 @@ def build_monarch_annotator(annotate_type): ) return annotator -def build_sapbert_annotator(annotate_type): - config = Config.from_env() +def build_sapbert_annotator(annotate_type, config: Config): + logger.info(f"Building Sapbert annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateSapbert( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), diff --git a/src/dug/hookspecs.py b/src/dug/hookspecs.py index 96b984b..9687b15 100644 --- a/src/dug/hookspecs.py +++ b/src/dug/hookspecs.py @@ -4,6 +4,7 @@ from dug.core.parsers import Parser from dug.core.annotators import Annotator +from dug.config import Config hookspec = pluggy.HookspecMarker("dug") @@ -15,7 +16,7 @@ def define_parsers(parser_dict: Dict[str, Parser]): ... @hookspec -def define_annotators(annotator_dict: Dict[str, Annotator]): +def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): """Defines what Annotators are available to Dug """ ... 
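The point of patch 23 is dependency injection: the define_annotators hook now receives the Config rather than each builder calling Config.from_env() behind the scenes. A stripped-down sketch of the pluggy mechanism (hook name and arguments mirror the patch; the plugin body and the dict-based config are illustrative, not the real Dug implementation):

    import pluggy

    hookspec = pluggy.HookspecMarker("dug")
    hookimpl = pluggy.HookimplMarker("dug")

    class DugSpecs:
        @hookspec
        def define_annotators(self, annotator_dict, config):
            """Plugins register annotators built from the injected config."""

    class ExamplePlugin:
        @hookimpl
        def define_annotators(self, annotator_dict, config):
            # No Config.from_env() here: the caller decides where config
            # comes from, so tests can inject a mock.
            annotator_dict["example"] = "annotator({})".format(config["url"])

    pm = pluggy.PluginManager("dug")
    pm.add_hookspecs(DugSpecs)
    pm.register(ExamplePlugin())

    available = {}
    pm.hook.define_annotators(annotator_dict=available, config={"url": "http://example"})
    assert "example" in available

The payoff shows up in the tests later in this series, where get_annotator(pm.hook, name, config) can be exercised with a MockConfig instead of environment variables.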
From 0147fae6231ca60dacde1bcf2746605475182cb9 Mon Sep 17 00:00:00 2001
From: YaphetKG
Date: Thu, 4 Jan 2024 15:12:55 -0500
Subject: [PATCH 24/46] remove `-` from annotator names

---
 src/dug/cli.py | 2 +-
 src/dug/config.py | 6 +++---
 src/dug/core/annotators/__init__.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/dug/cli.py b/src/dug/cli.py
index 4fd5923..f211e3a 100755
--- a/src/dug/cli.py
+++ b/src/dug/cli.py
@@ -55,7 +55,7 @@ def get_argparser():
         '-a', '--annotator',
         help='Annotator used to annotate identifiers in crawl file',
         dest="annotator_type",
-        default="annotator-monarch"
+        default="monarch"
     )

     crawl_parser.add_argument(
diff --git a/src/dug/config.py b/src/dug/config.py
index 5f49e9e..92e404d 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -34,14 +34,14 @@ class Config:
             "stopwords": ["the"],
         }
     )
-    annotator_type: str = "annotator-monarch"
+    annotator_type: str = "monarch"

     # Annotator config that will be passed to annotate.Annotator constructor
     annotator_args: dict = field(
         default_factory=lambda: {
-            "annotator-monarch": {
+            "monarch": {
                 "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content="
             },
-            "annotator-sapbert": {
+            "sapbert": {
                 "classificationUrl": "https://med-nemo.apps.renci.org/annotate/",
                 "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/",
             },
diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py
index acb7823..60b43df 100644
--- a/src/dug/core/annotators/__init__.py
+++ b/src/dug/core/annotators/__init__.py
@@ -14,8 +14,8 @@

 @hookimpl
 def define_annotators(annotator_dict: Dict[str, Annotator], config: Config):
-    annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch", config=config)
-    annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert", config=config)
+    annotator_dict["monarch"] = build_monarch_annotator("monarch", config=config)
+    annotator_dict["sapbert"] = build_sapbert_annotator("sapbert", config=config)

From 80e35ae6c76825028a1bdade90a8d498f2e9df2d Mon Sep 17 00:00:00 2001
From: YaphetKG
Date: Thu, 4 Jan 2024 17:47:19 -0500
Subject: [PATCH 25/46] normalize args for sapbert so they are easier to parse
 from env

---
 src/dug/config.py | 4 ++--
 src/dug/core/annotators/sapbert_annotator.py | 8 +++++--
 src/dug/core/factory.py | 22 --------------------
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/src/dug/config.py b/src/dug/config.py
index 92e404d..b070cac 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -42,8 +42,8 @@ class Config:
                 "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content="
             },
             "sapbert": {
-                "classificationUrl": "https://med-nemo.apps.renci.org/annotate/",
-                "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/",
+                "classification_url": "https://med-nemo.apps.renci.org/annotate/",
+                "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
             },
         }
     )
diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py
index a677140..6f2c93a 100644
--- a/src/dug/core/annotators/sapbert_annotator.py
+++ b/src/dug/core/annotators/sapbert_annotator.py
@@ -24,8 +24,12 @@ def __init__(
         ontology_greenlist=[],
         **kwargs
     ):
-        self.classificationUrl = kwargs['classificationUrl']
-        self.annotatorUrl = kwargs['annotatorUrl']
+        self.classificationUrl = kwargs.get('classification_url')
+        self.annotatorUrl = kwargs.get('annotator_url')
+        if not self.classificationUrl:
+            raise TypeError('Classification url needs to be defined for sapbert annotator')
+        if not self.annotatorUrl:
+            raise TypeError('Annotator url needs to be defined for sapbert annotator')
         self.normalizer = normalizer
         self.synonym_finder = synonym_finder
         self.ontology_greenlist = ontology_greenlist
diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py
index 6037f97..0bedab2 100644
--- a/src/dug/core/factory.py
+++ b/src/dug/core/factory.py
@@ -4,12 +4,6 @@
 from requests_cache import CachedSession

 import dug.core.tranql as tql
-# from dug.core.annotate import (DugAnnotator,
-#                                # Annotator,
-#                                Normalizer,
-#                                Preprocessor,
-#                                SynonymFinder,
-#                                ConceptExpander)
 from dug.core.concept_expander import ConceptExpander
 from dug.config import Config as DugConfig, TRANQL_SOURCE
 from dug.core.crawler import Crawler
@@ -53,22 +47,6 @@ def build_crawler(self, target, parser: Parser, annotator: Annotator, element_type=None) -> Crawler:

         return crawler

-    # def build_annotator(self) -> Annotator:
-
-    #     preprocessor = Preprocessor(**self.config.preprocessor)
-    #     annotator = Annotate(**self.config.annotator)
-    #     normalizer = Normalizer(**self.config.normalizer)
-    #     synonym_finder = SynonymFinder(**self.config.synonym_service)
-
-    #     annotator = Annotator(
-    #         preprocessor=preprocessor,
-    #         annotator=annotator,
-    #         normalizer=normalizer,
-    #         synonym_finder=synonym_finder
-    #     )
-
-    #     return annotator
-
     def build_tranqlizer(self) -> ConceptExpander:
         return ConceptExpander(**self.config.concept_expander)

From 096ba478f60c91d96dc26ead81d07da47c07b15c Mon Sep 17 00:00:00 2001
From: "Michael T. Bacon"
Date: Wed, 10 Jan 2024 19:26:55 -0500
Subject: [PATCH 26/46] Sorted lists for json serialization for parser and
 annotator outputs

---
 src/dug/core/annotators/_base.py | 44 +++++++++++++++++++++++++------
 src/dug/core/parsers/_base.py | 26 +++++++++++++++++--
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py
index c725bff..cc4fc18 100644
--- a/src/dug/core/annotators/_base.py
+++ b/src/dug/core/annotators/_base.py
@@ -15,15 +15,26 @@

 class DugIdentifier:
-    """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data.
-    \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future.
+    """Core information about a concept, produced from an annotator request

+    The Dug Identifier is the core piece of information about a concept,
+    produced from a request to an annotator based on some original source of
+    data.
+
+    \n The information that is being stored is mostly meant to support the
+    Monarch API but should be adjusted accordingly to suit new Annotators'
+    needs in the future.
     \n The information that will be needed for all annotators are:
     \n id: The CURIE identifier
     \n label: The CURIE identifier
     \n description: The CURIE identifier
-    \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator.
""" + def __init__(self, id, label, types=None, search_text="", description=""): + "custom init stores parameters to initial values" + self.id = id self.label = label self.description = description @@ -40,12 +51,12 @@ def id_type(self): return self.id.split(":")[0] def add_search_text(self, text): - # Add text only if it's unique and if not empty string + "Add text only if it's unique and if not empty string" if text and text not in self.search_text: self.search_text.append(text) def get_searchable_dict(self): - # Return a version of the identifier compatible with what's in ElasticSearch + "Return version of identifier compatible with what's in ElasticSearch" es_ident = { "id": self.id, "label": self.label, @@ -56,7 +67,13 @@ def get_searchable_dict(self): return es_ident def jsonable(self): - return self.__dict__ + "Output pickleable object (used by utils.complex_handler)" + outdict = self.__dict__ + + outdict['search_text'] = sorted(self.search_text) + outdict['synonyms'] = sorted(self.synonyms) + + return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -82,9 +99,18 @@ def __call__(self, value: Input, http_session: Session) -> Output: class DefaultNormalizer(): - """ After annotation there must be a Noramlizing step to collasce equivalent concepts into one official concept. This is a needed step for the knowledge graph to map between different concepts. - \n The reason why this class in integrated into the annotators.py is because currently there is only one supported Normalizer through the NCATs Translator API. - \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + """Default concept normalizer class + + After annotation there must be a Normalizing step to collasce equivalent + concepts into one official concept. This is a needed step for the knowledge + graph to map between different concepts. + + The reason why this class in integrated into the annotators.py is because + currently there is only one supported Normalizer through the NCATs + Translator API. + + When there is another supported Normalizer it will be seperated into a + separate plugin like annotator. """ def __init__(self, url): diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index acfc5bb..f827923 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,7 +29,18 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - return self.__dict__ + """Output a pickleable object + + used by dug.utils.complex_handler. Because search_terms and + optional_terms are considered unsorted lists by the parsers but will be + treated as sorted lists by python, sorting the lists before output + prevents changes in ordering from being treated as a change in output by + incremental change detection. + """ + outdict = self.__dict__ + outdict['search_terms'] = sorted(self.search_terms) + outdict['optional_terms'] = sorted(self.optional_terms) + return outdict def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -132,7 +143,18 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - return self.__dict__ + """Output a pickleable object + + used by dug.utils.complex_handler. 
Because search_terms and + optional_terms are considered unsorted lists by the parsers but will be + treated as sorted lists by python, sorting the lists before output + prevents changes in ordering from being treated as a change in output by + incremental change detection. + """ + outdict = self.__dict__ + outdict['search_terms'] = sorted(self.search_terms) + outdict['optional_terms'] = sorted(self.optional_terms) + return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 0b7b51fba6bb1b53c2260bd9a7703b675ba95881 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Tue, 16 Jan 2024 15:34:00 -0500 Subject: [PATCH 27/46] Reverted jsonable, sorted lists on assignment and change, rather than on json output --- src/dug/core/annotators/_base.py | 10 +++----- src/dug/core/parsers/_base.py | 40 ++++++++------------------------ 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index cc4fc18..cb4c7fd 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -41,7 +41,7 @@ def __init__(self, id, label, types=None, search_text="", description=""): if types is None: types = [] self.types = types - self.search_text = [search_text] if search_text else [] + self.search_text = sorted([search_text]) if search_text else [] self.equivalent_identifiers = [] self.synonyms = [] self.purl = "" @@ -53,7 +53,7 @@ def id_type(self): def add_search_text(self, text): "Add text only if it's unique and if not empty string" if text and text not in self.search_text: - self.search_text.append(text) + self.search_text = sorted(self.search_text + [text]) def get_searchable_dict(self): "Return version of identifier compatible with what's in ElasticSearch" @@ -68,12 +68,8 @@ def get_searchable_dict(self): def jsonable(self): "Output pickleable object (used by utils.complex_handler)" - outdict = self.__dict__ + return self.__dict__ - outdict['search_text'] = sorted(self.search_text) - outdict['synonyms'] = sorted(self.synonyms) - - return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index f827923..a5262e5 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,18 +29,8 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - """Output a pickleable object - - used by dug.utils.complex_handler. Because search_terms and - optional_terms are considered unsorted lists by the parsers but will be - treated as sorted lists by python, sorting the lists before output - prevents changes in ordering from being treated as a change in output by - incremental change detection. 
- """ - outdict = self.__dict__ - outdict['search_terms'] = sorted(self.search_terms) - outdict['optional_terms'] = sorted(self.optional_terms) - return outdict + """Output a pickleable object""" + return self.__dict__ def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -66,7 +56,7 @@ def set_search_terms(self): concept.set_search_terms() search_terms.extend(concept.search_terms) search_terms.append(concept.name) - search_terms = list(set(search_terms)) + search_terms = sorted(list(set(search_terms))) self.search_terms = search_terms def set_optional_terms(self): @@ -74,7 +64,7 @@ def set_optional_terms(self): for concept_id, concept in self.concepts.items(): concept.set_optional_terms() optional_terms.extend(concept.optional_terms) - optional_terms = list(set(optional_terms)) + optional_terms = sorted(list(set(optional_terms))) self.optional_terms = optional_terms def __str__(self): @@ -110,15 +100,15 @@ def add_kg_answer(self, answer, query_name): self.kg_answers[answer_id] = answer def clean(self): - self.search_terms = list(set(self.search_terms)) - self.optional_terms = list(set(self.optional_terms)) + self.search_terms = sorted(list(set(self.search_terms))) + self.optional_terms = sorted(list(set(self.optional_terms))) def set_search_terms(self): # Traverse set of identifiers to determine set of search terms search_terms = self.search_terms for ident_id, ident in self.identifiers.items(): search_terms.extend(ident.search_text + ident.synonyms) - self.search_terms = list(set(search_terms)) + self.search_terms = sorted(list(set(search_terms))) def set_optional_terms(self): # Traverse set of knowledge graph answers to determine set of optional search terms @@ -126,7 +116,7 @@ def set_optional_terms(self): for kg_id, kg_answer in self.kg_answers.items(): optional_terms += kg_answer.get_node_names() optional_terms += kg_answer.get_node_synonyms() - self.optional_terms = list(set(optional_terms)) + self.optional_terms = sorted(list(set(optional_terms))) def get_searchable_dict(self): # Translate DugConcept into Elastic-Compatible Concept @@ -143,18 +133,8 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - """Output a pickleable object - - used by dug.utils.complex_handler. Because search_terms and - optional_terms are considered unsorted lists by the parsers but will be - treated as sorted lists by python, sorting the lists before output - prevents changes in ordering from being treated as a change in output by - incremental change detection. - """ - outdict = self.__dict__ - outdict['search_terms'] = sorted(self.search_terms) - outdict['optional_terms'] = sorted(self.optional_terms) - return outdict + """Output a pickleable object""" + return self.__dict__ def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 0bb708584203ae1ff653cdde9ecc64fd188278cf Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 17 Jan 2024 14:35:34 -0500 Subject: [PATCH 28/46] Trying bumps in Docker base images --- Dockerfile | 4 ++-- docker-compose.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c009bc5..01faa15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. 
# ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.1-alpine3.18 # Install required packages RUN apk update && \ @@ -31,4 +31,4 @@ RUN make install RUN make install.dug # Run it -ENTRYPOINT dug \ No newline at end of file +ENTRYPOINT dug diff --git a/docker-compose.yaml b/docker-compose.yaml index 8e59bd5..ccc22a3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -56,7 +56,7 @@ services: ## ################################################################################# elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 + image: docker.elastic.co/elasticsearch/elasticsearch:8.11.3 networks: - dug-network environment: @@ -76,7 +76,7 @@ services: ## ################################################################################# redis: - image: 'redis/redis-stack:6.2.4-v2' + image: 'redis/redis-stack:6.2.14' networks: - dug-network environment: From ef0b74dde9646212804aac5899af3b4edaf3eeda Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 17 Jan 2024 17:10:02 -0500 Subject: [PATCH 29/46] Adding jsonpickle to requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bac13a6..566a6b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ elasticsearch[async]==8.5.2 gunicorn itsdangerous Jinja2 +jsonpickle jsonschema MarkupSafe ormar @@ -26,4 +27,4 @@ click httpx linkml-runtime==1.6.0 bmt==1.1.0 -urllib3 \ No newline at end of file +urllib3 From ebf9078f731d5e6fd3c566ae92058608b76c8b28 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Mon, 22 Jan 2024 11:55:31 -0500 Subject: [PATCH 30/46] Moving required python version back to 3.11. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b551ef3..75fe4d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.12 +python_requires = >=3.11 include_package_data = true install_requires = elasticsearch==8.5.2 From 56b85df6fb01e53efabb93815a12733b705105a8 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Mon, 22 Jan 2024 12:06:04 -0500 Subject: [PATCH 31/46] Changing image back to 3.11 as well --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 01faa15..4f21b36 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.1-alpine3.18 +FROM python:3.11-alpine # Install required packages RUN apk update && \ From 8834423a0234b67267691a29f948afd3da51d2b8 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Mon, 22 Jan 2024 13:45:14 -0500 Subject: [PATCH 32/46] Backing up redis image change to see if I can get dug auto-build to work again --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index ccc22a3..8e8d27d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -76,7 +76,7 @@ services: ## ################################################################################# redis: - image: 'redis/redis-stack:6.2.14' + image: 'redis/redis-stack:6.2.4-v2' networks: - dug-network environment: From 022f6988418e8fdfdc55e59c4d316209261261a1 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 04:48:34 -0500 Subject: [PATCH 33/46] Build all branches for testing, pushing only to docker. Fix tag bypass for build-push-release action --- .github/workflows/build-push-release.yml | 2 +- .github/workflows/code-checks.yml | 83 +++++++++++++----------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index f23dc15..a383cef 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -18,7 +18,7 @@ on: - .dockerignore - .githooks tags-ignore: - - 'v[0-9]+.[0-9]+.*' + - '*' jobs: build-push-release: runs-on: ubuntu-latest diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0dc8428..401c24c 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -66,45 +66,6 @@ jobs: # flake8 --ignore=E,W --exit-zero . continue-on-error: true -# ############################## build-vuln-test ############################## - # build-vuln-test: - # # needs: flake8-linter - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # with: - # driver-opts: | - # network=host - - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_TOKEN }} - # logout: true - - # # Notes on Cache: - # # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - # - name: Build Container - # uses: docker/build-push-action@v5 - # with: - # context: . 
- # push: false - # load: true - # tag: ${{ github.repository }}:vuln-test - # cache-from: type=registry,ref=${{ github.repository }}:buildcache - # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max - # ####### Run for Fidelity ###### - # - name: Run Trivy vulnerability scanner - # uses: aquasecurity/trivy-action@master - # with: - # image-ref: '${{ github.repository }}:vuln-test' - # severity: 'CRITICAL,HIGH' - # exit-code: '1' - ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest @@ -145,3 +106,47 @@ jobs: - name: Test with Bandit run: | bandit -r src -n3 -lll + +############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file From ef8b7211f41d553b2a1280055334baf81ec57402 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:15:50 -0500 Subject: [PATCH 34/46] Testing alpine to fix trivy error --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4f21b36..f34afa2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.11-alpine +FROM python:3.12.0-alpine3.18 # Install required packages RUN apk update && \ From e16a347439523879da62bf8e16bd8f17d97e0699 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:24:52 -0500 Subject: [PATCH 35/46] Vuln confirmed in image, new docker image test --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f34afa2..e8d1ce2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:alpine3.19 # Install required packages RUN apk update && \ From 5be0195dc23d05477d0f6102182fb536e5eff14b Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:30:13 -0500 Subject: [PATCH 36/46] Is buildcache causing trivy failures? 
--- .github/workflows/trivy-pr-scan.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml
index 19f86e1..8d14372 100644
--- a/.github/workflows/trivy-pr-scan.yml
+++ b/.github/workflows/trivy-pr-scan.yml
@@ -45,8 +45,8 @@ jobs:
           push: false
           load: true
           tags: ${{ github.repository }}:vuln-test
-          cache-from: type=registry,ref=${{ github.repository }}:buildcache
-          cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max
+          # cache-from: type=registry,ref=${{ github.repository }}:buildcache
+          # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max

       # We will not be concerned with Medium and Low vulnerabilities
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
         with:

From d17578db8ce6e9be77ff45b07796ddc8e23a709e Mon Sep 17 00:00:00 2001
From: Joshua Seals
Date: Tue, 23 Jan 2024 05:42:58 -0500
Subject: [PATCH 37/46] Re-enabling cache after testing

---
 .github/workflows/trivy-pr-scan.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml
index 8d14372..19f86e1 100644
--- a/.github/workflows/trivy-pr-scan.yml
+++ b/.github/workflows/trivy-pr-scan.yml
@@ -45,8 +45,8 @@ jobs:
           push: false
           load: true
           tags: ${{ github.repository }}:vuln-test
-          # cache-from: type=registry,ref=${{ github.repository }}:buildcache
-          # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max
+          cache-from: type=registry,ref=${{ github.repository }}:buildcache
+          cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max

       # We will not be concerned with Medium and Low vulnerabilities
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
         with:

From d1ff3c966f8fcfa3648e34b1dfe7754093bee63a Mon Sep 17 00:00:00 2001
From: Joshua Seals
Date: Tue, 23 Jan 2024 05:53:22 -0500
Subject: [PATCH 38/46] Revert to older trivy release

---
 .github/workflows/trivy-pr-scan.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml
index 19f86e1..83f58f7 100644
--- a/.github/workflows/trivy-pr-scan.yml
+++ b/.github/workflows/trivy-pr-scan.yml
@@ -50,7 +50,7 @@
       # We will not be concerned with Medium and Low vulnerabilities
       - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@master
+        uses: aquasecurity/trivy-action@v0.16.0
         with:
           image-ref: '${{ github.repository }}:vuln-test'
           format: 'sarif'

From 96f7338f6977b88531692eebaa818f14cd07e435 Mon Sep 17 00:00:00 2001
From: Joshua Seals
Date: Tue, 23 Jan 2024 05:54:25 -0500
Subject: [PATCH 39/46] trivy scan update

---
 .github/workflows/trivy-pr-scan.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml
index 83f58f7..19f86e1 100644
--- a/.github/workflows/trivy-pr-scan.yml
+++ b/.github/workflows/trivy-pr-scan.yml
@@ -50,7 +50,7 @@
       # We will not be concerned with Medium and Low vulnerabilities
       - name: Run Trivy vulnerability scanner
-        uses: aquasecurity/trivy-action@v0.16.0
+        uses: aquasecurity/trivy-action@master
         with:
           image-ref: '${{ github.repository }}:vuln-test'
           format: 'sarif'

From 5bee00d4f066eb55e1a29310cca8353aa96cda34 Mon Sep 17 00:00:00 2001
From: YaphetKG
Date: Wed, 24 Jan 2024 17:06:19 -0500
Subject: [PATCH 40/46] adding pytest asyncio

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index bac13a6..531f5ab 100644
---
a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ mistune pluggy pyrsistent pytest +pytest-asyncio pytz PyYAML requests From 9cb89cab9f2147ce344b01b45252e6a3d985819d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:26:24 -0500 Subject: [PATCH 41/46] fix tests --- tests/unit/mocks/data/mock_config.py | 16 +++++++++++++--- tests/unit/test_annotators.py | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/unit/mocks/data/mock_config.py b/tests/unit/mocks/data/mock_config.py index 27ca191..d70f8a3 100644 --- a/tests/unit/mocks/data/mock_config.py +++ b/tests/unit/mocks/data/mock_config.py @@ -13,9 +13,19 @@ class MockConfig: }) # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "http://classifier.api/annotate/", + "annotator_url": "http://entity-link.api/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index c1702ee..2c7bde0 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg.annotator_args ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From 64f3cb6de1741d2ced7f0ce68b94d0d6499ee2ea Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:34:19 -0500 Subject: [PATCH 42/46] fix annotator init --- tests/integration/test_annotators.py | 4 ++-- tests/unit/test_annotators.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index 8004d0d..a9778bf 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -28,7 +28,7 @@ def test_monarch_annotation_full( synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["monarch"] ) input_text = "heart attack" @@ -95,7 +95,7 @@ def test_sapbert_annotation_full( normalizer = DefaultNormalizer(**cfg.normalizer) synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) - annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder) + annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["sapbert"]) input_text = "Have you ever had a heart attack?" 
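    # Note on the flow under test here: sapbert annotation is two requests,
    # not one. The classification service (classification_url, the med-nemo
    # endpoint) first extracts candidate spans from the free text, then each
    # span is linked to a CURIE by the entity linker (annotator_url, the
    # babel-sapbert endpoint). Roughly, with illustrative method names rather
    # than the exact Dug API:
    #     spans = classify(input_text)      # POST to classification_url
    #     identifiers = link(spans)         # POST to annotator_url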
# Fetch Classifiers diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index 2c7bde0..5ea804d 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg.annotator_args + normalizer=normalizer, synonym_finder=synonym_finder, kwargs=cfg.annotator_args["monarch"] ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From 15cccfe56d30111be2b26e96b743edf228eac7a5 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:46:44 -0500 Subject: [PATCH 43/46] fix all the tests --- tests/integration/conftest.py | 2 +- tests/integration/mocks/mock_config.py | 17 ++++++++++++++--- tests/integration/test_annotators.py | 2 +- tests/unit/test_annotators.py | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b671e3f..50f5787 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -122,7 +122,7 @@ def token_classifier_api(): def sapbert_annotator_api(): return MockApiService( urls={ - "https://babel-sapbert.apps.renci.org/annotate/": [ + "https://med-nemo.apps.renci.org/annotate/": [ json.dumps( [ { diff --git a/tests/integration/mocks/mock_config.py b/tests/integration/mocks/mock_config.py index 27ca191..82bcd1b 100644 --- a/tests/integration/mocks/mock_config.py +++ b/tests/integration/mocks/mock_config.py @@ -12,10 +12,21 @@ class MockConfig: "stopwords": ["the"] }) + # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://med-nemo.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index a9778bf..eecfd1e 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -28,7 +28,7 @@ def test_monarch_annotation_full( synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["monarch"] + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg, **cfg.annotator_args["monarch"] ) input_text = "heart attack" diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index 5ea804d..830a140 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, kwargs=cfg.annotator_args["monarch"] + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg , **cfg.annotator_args["monarch"] ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From f3d94110558738242fdfed4eefaf4e87558d4ecf Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Wed, 24 Jan 2024 17:52:52 -0500 Subject: [PATCH 44/46] Forced Python 3.11 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e8d1ce2..c7e9bc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:alpine3.19 +FROM python:3.11-alpine3.19 # Install required packages RUN apk update && \ From d7257dfae5cbb973609562670cba5d1be048ceb7 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:52:59 -0500 Subject: [PATCH 45/46] bump docker image version to 0 vuls --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c009bc5..6147d76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.1-alpine3.19 # Install required packages RUN apk update && \ From 275abcbacd42bba1ec5cf89869ab845b37776a65 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 18:07:57 -0500 Subject: [PATCH 46/46] zero again 0_o --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e42083a..3980ddf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,10 @@ FROM python:3.12.1-alpine3.19 # Install required packages RUN apk update && \ - apk add g++ make + apk add g++ make + +#upgrade openssl \ +RUN apk add openssl=3.1.4-r4 RUN pip install --upgrade pip # Create a non-root user.