diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index f23dc15..a383cef 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -18,7 +18,7 @@ on: - .dockerignore - .githooks tags-ignore: - - 'v[0-9]+.[0-9]+.*' + - '*' jobs: build-push-release: runs-on: ubuntu-latest diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0dc8428..401c24c 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -66,45 +66,6 @@ jobs: # flake8 --ignore=E,W --exit-zero . continue-on-error: true -# ############################## build-vuln-test ############################## - # build-vuln-test: - # # needs: flake8-linter - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # with: - # driver-opts: | - # network=host - - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_TOKEN }} - # logout: true - - # # Notes on Cache: - # # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - # - name: Build Container - # uses: docker/build-push-action@v5 - # with: - # context: . - # push: false - # load: true - # tag: ${{ github.repository }}:vuln-test - # cache-from: type=registry,ref=${{ github.repository }}:buildcache - # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max - # ####### Run for Fidelity ###### - # - name: Run Trivy vulnerability scanner - # uses: aquasecurity/trivy-action@master - # with: - # image-ref: '${{ github.repository }}:vuln-test' - # severity: 'CRITICAL,HIGH' - # exit-code: '1' - ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest @@ -145,3 +106,47 @@ jobs: - name: Test with Bandit run: | bandit -r src -n3 -lll + +############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c009bc5..3980ddf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,15 @@ # A container for the core semantic-search capability. 
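The new `test-image-build` job tags the pushed image as `test_` plus the Git ref name with its last `/`-separated component stripped (`${REF%/*}`). A rough Python rendering of that shell expansion, for illustration only (the helper name is made up):

```python
def parse_ref_name(ref_name: str) -> str:
    """Mimic the shell expansion `${REF%/*}`: drop everything from the last '/'
    onward; refs without a slash pass through unchanged."""
    return ref_name.rsplit("/", 1)[0] if "/" in ref_name else ref_name

assert parse_ref_name("develop") == "develop"
assert parse_ref_name("feature/new-annotator") == "feature"
# Resulting image tag: f"{repository}:test_{parse_ref_name(ref_name)}"
```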
# ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.1-alpine3.19 + # Install required packages RUN apk update && \ - apk add g++ make + apk add g++ make + +#upgrade openssl \ +RUN apk add openssl=3.1.4-r4 RUN pip install --upgrade pip # Create a non-root user. @@ -31,4 +35,4 @@ RUN make install RUN make install.dug # Run it -ENTRYPOINT dug \ No newline at end of file +ENTRYPOINT dug diff --git a/docker-compose.yaml b/docker-compose.yaml index 8e59bd5..8e8d27d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -56,7 +56,7 @@ services: ## ################################################################################# elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 + image: docker.elastic.co/elasticsearch/elasticsearch:8.11.3 networks: - dug-network environment: diff --git a/requirements.txt b/requirements.txt index bac13a6..2bbadab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ elasticsearch[async]==8.5.2 gunicorn itsdangerous Jinja2 +jsonpickle jsonschema MarkupSafe ormar @@ -13,6 +14,7 @@ mistune pluggy pyrsistent pytest +pytest-asyncio pytz PyYAML requests @@ -26,4 +28,4 @@ click httpx linkml-runtime==1.6.0 bmt==1.1.0 -urllib3 \ No newline at end of file +urllib3 diff --git a/setup.cfg b/setup.cfg index b551ef3..0df3d5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.12 +python_requires = >=3.10 include_package_data = true install_requires = elasticsearch==8.5.2 diff --git a/src/dug/cli.py b/src/dug/cli.py index 4fd5923..f211e3a 100755 --- a/src/dug/cli.py +++ b/src/dug/cli.py @@ -55,7 +55,7 @@ def get_argparser(): '-a', '--annotator', help='Annotator used to annotate identifiers in crawl file', dest="annotator_type", - default="annotator-monarch" + default="monarch" ) crawl_parser.add_argument( diff --git a/src/dug/config.py b/src/dug/config.py index 5f4d59d..b070cac 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -9,8 +9,9 @@ @dataclass class Config: """ - TODO: Populate description + TODO: Populate description """ + elastic_password: str = "changeme" redis_password: str = "changeme" @@ -27,74 +28,102 @@ class Config: nboost_port: int = 8000 # Preprocessor config that will be passed to annotate.Preprocessor constructor - preprocessor: dict = field(default_factory=lambda: { - "debreviator": { - "BMI": "body mass index" - }, - "stopwords": ["the"] - }) - + preprocessor: dict = field( + default_factory=lambda: { + "debreviator": {"BMI": "body mass index"}, + "stopwords": ["the"], + } + ) + annotator_type: str = "monarch" # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" - }) + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor - normalizer: dict = field(default_factory=lambda: { - "url": 
"https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" - }) + normalizer: dict = field( + default_factory=lambda: { + "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" + } + ) # Synonym service config that will be passed to annotate.SynonymHelper constructor - synonym_service: dict = field(default_factory=lambda: { - "url": "https://name-resolution-sri.renci.org/reverse_lookup" - }) + synonym_service: dict = field( + default_factory=lambda: { + "url": "https://name-resolution-sri.renci.org/reverse_lookup" + } + ) # Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor - ontology_helper: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/bioentity/" - }) + ontology_helper: dict = field( + default_factory=lambda: { + "url": "https://api.monarchinitiative.org/api/bioentity/" + } + ) # Redlist of identifiers not to expand via TranQL tranql_exclude_identifiers: list = field(default_factory=lambda: ["CHEBI:17336"]) - tranql_queries: dict = field(default_factory=lambda: { - "disease": ["disease", "phenotypic_feature"], - "pheno": ["phenotypic_feature", "disease"], - "anat": ["disease", "anatomical_entity"], - "chem_to_disease": ["chemical_entity", "disease"], - "small_molecule_to_disease": ["small_molecule", "disease"], - "chemical_mixture_to_disease": ["chemical_mixture", "disease"], - "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], - }) - - node_to_element_queries: dict = field(default_factory=lambda: { - # Dug element type to cast the query kg nodes to - "cde": { - # Parse nodes matching criteria in kg - "node_type": "biolink:Publication", - "curie_prefix": "HEALCDE", - # list of attributes that are lists to be casted to strings - "list_field_choose_first": [ - "files" - ], - "attribute_mapping": { - # "DugElement Attribute" : "KG Node attribute" - "name": "name", - "desc": "summary", - "collection_name": "cde_category", - "collection_id": "cde_category", - "action": "files" + tranql_queries: dict = field( + default_factory=lambda: { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"], + "anat": ["disease", "anatomical_entity"], + "chem_to_disease": ["chemical_entity", "disease"], + "small_molecule_to_disease": ["small_molecule", "disease"], + "chemical_mixture_to_disease": ["chemical_mixture", "disease"], + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], + } + ) + + node_to_element_queries: dict = field( + default_factory=lambda: { + # Dug element type to cast the query kg nodes to + "cde": { + # Parse nodes matching criteria in kg + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + # list of attributes that are lists to be casted to strings + "list_field_choose_first": ["files"], + "attribute_mapping": { + # "DugElement Attribute" : "KG Node attribute" + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + "collection_id": "cde_category", + "action": "files", + }, } } - }) + ) - concept_expander: dict = field(default_factory=lambda: { - "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", - "min_tranql_score": 0.0 - }) + concept_expander: dict = field( + default_factory=lambda: { + "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", + "min_tranql_score": 0.0, + } + ) # List of ontology types that can be used 
even if they fail normalization - ontology_greenlist: list = field(default_factory=lambda: ["PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"]) + ontology_greenlist: list = field( + default_factory=lambda: [ + "PATO", + "CHEBI", + "MONDO", + "UBERON", + "HP", + "MESH", + "UMLS", + ] + ) @classmethod def from_env(cls): @@ -107,7 +136,7 @@ def from_env(cls): "elastic_password": "ELASTIC_PASSWORD", "redis_host": "REDIS_HOST", "redis_port": "REDIS_PORT", - "redis_password": "REDIS_PASSWORD" + "redis_password": "REDIS_PASSWORD", } kwargs = {} diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py index 9fca7ce..effcb7b 100644 --- a/src/dug/core/__init__.py +++ b/src/dug/core/__init__.py @@ -63,7 +63,7 @@ def crawl(self, target_name: str, parser_type: str, annotator_type: str, element pm = get_plugin_manager() parser = get_parser(pm.hook, parser_type) - annotator = get_annotator(pm.hook, annotator_type) + annotator = get_annotator(pm.hook, annotator_type, self._factory.config) targets = get_targets(target_name) for target in targets: diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py deleted file mode 100644 index 240752f..0000000 --- a/src/dug/core/annotate.py +++ /dev/null @@ -1,612 +0,0 @@ -# import json -# import logging -# import os -# import re -# import urllib.parse -# from typing import TypeVar, Generic, Union, List, Tuple, Optional -# import bmt -# import requests -# from requests import Session - -# import dug.core.tranql as tql - - -# logger = logging.getLogger('dug') - -# logging.getLogger("requests").setLevel(logging.WARNING) -# logging.getLogger("urllib3").setLevel(logging.WARNING) - - -# class Identifier: -# def __init__(self, id, label, types=None, search_text="", description=""): -# self.id = id -# self.label = label -# self.description = description -# if types is None: -# types = [] -# self.types = types -# self.search_text = [search_text] if search_text else [] -# self.equivalent_identifiers = [] -# self.synonyms = [] -# self.purl = "" - -# @property -# def id_type(self): -# return self.id.split(":")[0] - -# def add_search_text(self, text): -# # Add text only if it's unique and if not empty string -# if text and text not in self.search_text: -# self.search_text.append(text) - -# def get_searchable_dict(self): -# # Return a version of the identifier compatible with what's in ElasticSearch -# es_ident = { -# 'id': self.id, -# 'label': self.label, -# 'equivalent_identifiers': self.equivalent_identifiers, -# 'type': self.types, -# 'synonyms': self.synonyms -# } -# return es_ident - -# def jsonable(self): -# return self.__dict__ - - -# class DugAnnotator: -# def __init__( -# self, -# preprocessor: "Preprocessor", -# annotator: "Annotator", -# normalizer: "Normalizer", -# synonym_finder: "SynonymFinder", -# ontology_greenlist=[], -# ): -# self.preprocessor = preprocessor -# self.annotator = annotator -# self.normalizer = normalizer -# self.synonym_finder = synonym_finder -# self.ontology_greenlist = ontology_greenlist -# self.norm_fails_file = "norm_fails.txt" -# self.anno_fails_file = "anno_fails.txt" - -# def annotate(self, text, http_session): - -# # Preprocess text (debraviate, remove stopwords, etc.) 
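With the `crawl()` change above, the annotator is resolved through the plugin hook using the runtime `Config` instead of each builder re-reading the environment. A minimal sketch of that call path, assuming the import locations shown in this diff:

```python
from dug.config import Config
from dug.core import get_plugin_manager      # assumed location of the plugin-manager helper
from dug.core.annotators import get_annotator

config = Config.from_env()                   # reads ELASTIC_PASSWORD, REDIS_HOST, REDIS_PORT, ...
pm = get_plugin_manager()
annotator = get_annotator(pm.hook, "monarch", config)   # "sapbert" selects the other annotator
```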
-# text = self.preprocessor.preprocess(text) - -# # Fetch identifiers -# raw_identifiers = self.annotator.annotate(text, http_session) - -# # Write out to file if text fails to annotate -# if not raw_identifiers: -# with open(self.anno_fails_file, "a") as fh: -# fh.write(f'{text}\n') - -# processed_identifiers = [] -# for identifier in raw_identifiers: - -# # Normalize identifier using normalization service -# norm_id = self.normalizer.normalize(identifier, http_session) - -# # Skip adding id if it doesn't normalize -# if norm_id is None: -# # Write out to file if identifier doesn't normalize -# with open(self.norm_fails_file, "a") as fh: -# fh.write(f'{identifier.id}\n') - -# # Discard non-normalized ident if not in greenlist -# if identifier.id_type not in self.ontology_greenlist: -# continue - -# # If it is in greenlist just keep moving forward -# norm_id = identifier - -# # Add synonyms to identifier -# norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) - -# # Get pURL for ontology identifer for more info -# norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) -# processed_identifiers.append(norm_id) - -# return processed_identifiers - - -# class ConceptExpander: -# def __init__(self, url, min_tranql_score=0.2): -# self.url = url -# self.min_tranql_score = min_tranql_score -# self.include_node_keys = ["id", "name", "synonyms"] -# self.include_edge_keys = [] -# self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} - -# def is_acceptable_answer(self, answer): -# return True - -# def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): - -# answer_kgs = [] - -# # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers -# if os.path.exists(kg_filename): -# logger.info(f"identifier {identifier} is already crawled. Skipping TranQL query.") -# with open(kg_filename, 'r') as stream: -# response = json.load(stream) -# else: -# query = query_factory.get_query(identifier) -# logger.debug(query) -# response = requests.post( -# url=self.url, -# headers=self.tranql_headers, -# data=query).json() - -# # Case: Skip if empty KG -# try: -# if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: -# logger.debug(f"Did not find a knowledge graph for {query}") -# logger.debug(f"{self.url} returned response: {response}") -# return [] -# except KeyError as e: -# logger.error(f"Could not find key: {e} in response: {response}") - -# # Dump out to file if there's a knowledge graph -# with open(kg_filename, 'w') as stream: -# json.dump(response, stream, indent=2) - -# # Get nodes in knowledge graph hashed by ids for easy lookup -# noMessage = (len(response.get("message",{})) == 0) -# statusError = (response.get("status","") == 'Error') -# if noMessage or statusError: -# # Skip on error -# logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") -# return [] -# kg = tql.QueryKG(response) - -# for answer in kg.answers: -# # Filter out answers that don't meet some criteria -# # Right now just don't filter anything -# logger.debug(f"Answer: {answer}") -# if not self.is_acceptable_answer(answer): -# logger.warning("Skipping answer as it failed one or more acceptance criteria. 
See log for details.") -# continue - -# # Get subgraph containing only information for this answer -# try: -# # Temporarily surround in try/except because sometimes the answer graphs -# # contain invalid references to edges/nodes -# # This will be fixed in Robokop but for now just silently warn if answer is invalid -# node_attributes_filter = None if include_all_attributes else self.include_node_keys -# edge_attributes_filter = None if include_all_attributes else self.include_edge_keys -# answer_kg = kg.get_answer_subgraph(answer, -# include_node_keys=node_attributes_filter, -# include_edge_keys=edge_attributes_filter) - -# # Add subgraph to list of acceptable answers to query -# answer_kgs.append(answer_kg) - -# except tql.MissingNodeReferenceError: -# # TEMPORARY: Skip answers that have invalid node references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of non-preferred id! " -# "See err msg for details.") -# continue -# except tql.MissingEdgeReferenceError: -# # TEMPORARY: Skip answers that have invalid edge references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of invalid edge reference! " -# "See err msg for details.") -# continue - -# return answer_kgs - - -# class Preprocessor: -# """"Class for preprocessing strings so they are better interpreted by NLP steps""" - -# def __init__(self, debreviator=None, stopwords=None): -# if debreviator is None: -# debreviator = self.default_debreviator_factory() -# self.decoder = debreviator - -# if stopwords is None: -# stopwords = [] -# self.stopwords = stopwords - -# def preprocess(self, text: str) -> str: -# """ -# Apply debreviator to replace abbreviations and other characters - -# >>> pp = Preprocessor({"foo": "bar"}, ["baz"]) -# >>> pp.preprocess("Hello foo") -# 'Hello bar' - -# >>> pp.preprocess("Hello baz world") -# 'Hello world' -# """ - -# for key, value in self.decoder.items(): -# text = text.replace(key, value) - -# # Remove any stopwords -# text = " ".join([word for word in text.split() if word not in self.stopwords]) -# return text - -# @staticmethod -# def default_debreviator_factory(): -# return {"bmi": "body mass index", "_": " "} - - -# Input = TypeVar("Input") -# Output = TypeVar("Output") - - -# class ApiClient(Generic[Input, Output]): - -# def make_request(self, value: Input, http_session: Session): -# raise NotImplementedError() - -# def handle_response(self, value, response: Union[dict, list]) -> Output: -# raise NotImplementedError() - -# def __call__(self, value: Input, http_session: Session) -> Output: -# response = self.make_request(value, http_session) - -# result = self.handle_response(value, response) - -# return result - - -# class Annotator(ApiClient[str, List[Identifier]]): -# """ -# Use monarch API service to fetch ontology IDs found in text -# """ - -# def __init__(self, url: str): -# self.url = url - -# def sliding_window(self, text, max_characters=2000, padding_words=5): -# """ -# For long texts sliding window works as the following -# "aaaa bbb ccc ddd eeee" -# with a sliding max chars 8 and padding 1 -# first yeild would be "aaaa bbb" -# next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" -# allowing context to be preserved with the scope of padding -# For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
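The sliding-window behaviour described in the docstring above is easier to see in a standalone sketch (simplified word-based chunking, not the removed implementation verbatim):

```python
def sliding_window(text, max_characters=2000, padding_words=5):
    """Yield chunks of at most max_characters, restarting each new chunk
    padding_words words back so context is preserved across chunk boundaries."""
    words = text.split()
    start = 0
    while start < len(words):
        chunk, length, i = [], 0, start
        while i < len(words):
            sep = 1 if chunk else 0
            if length + sep + len(words[i]) > max_characters:
                break
            length += sep + len(words[i])
            chunk.append(words[i])
            i += 1
        yield " ".join(chunk)
        if i >= len(words):
            break
        start = max(i - padding_words, start + 1)   # overlap, but always advance

# Matches the docstring example: max 8 characters, padding 1.
assert list(sliding_window("aaaa bbb ccc ddd eeee", 8, 1)) == [
    "aaaa bbb", "bbb ccc", "ccc ddd", "ddd eeee"]
```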
-# """ -# words = text.split(' ') -# total_words = len(words) -# window_end = False -# current_index = 0 -# while not window_end: -# current_string = "" -# for index, word in enumerate(words[current_index: ]): -# if len(current_string) + len(word) + 1 >= max_characters: -# yield current_string + " " -# current_index += index - padding_words -# break -# appendee = word if index == 0 else " " + word -# current_string += appendee - -# if current_index + index == len(words) - 1: -# window_end = True -# yield current_string - -# def annotate(self, text, http_session): -# logger.debug(f"Annotating: {text}") -# identifiers = [] -# for chunk_text in self.sliding_window(text): -# identifiers += self(chunk_text, http_session) -# return identifiers - -# def make_request(self, value: Input, http_session: Session): -# value = urllib.parse.quote(value) -# url = f'{self.url}{value}' - -# # This could be moved to a config file -# NUM_TRIES = 5 -# for _ in range(NUM_TRIES): -# response = http_session.get(url) -# if response is not None: -# # looks like it worked -# break - -# # if the reponse is still None here, throw an error -# if response is None: -# raise RuntimeError(f"no response from {url}") -# return response.json() - -# def handle_response(self, value, response: dict) -> List[Identifier]: -# identifiers = [] -# """ Parse each identifier and initialize identifier object """ -# for span in response.get('spans', []): -# search_text = span.get('text', None) -# for token in span.get('token', []): -# curie = token.get('id', None) -# if not curie: -# continue - -# biolink_types = token.get('category') -# label = token.get('terms')[0] -# identifiers.append(Identifier(id=curie, -# label=label, -# types=biolink_types, -# search_text=search_text)) -# return identifiers - - -# class Normalizer(ApiClient[Identifier, Identifier]): -# def __init__(self, url): -# self.bl_toolkit = bmt.Toolkit() -# self.url = url - -# def normalize(self, identifier: Identifier, http_session: Session): -# # Use RENCI's normalization API service to get the preferred version of an identifier -# logger.debug(f"Normalizing: {identifier.id}") -# return self(identifier, http_session) - -# def make_request(self, value: Identifier, http_session: Session) -> dict: -# curie = value.id -# url = f"{self.url}{urllib.parse.quote(curie)}" -# try: -# response = http_session.get(url) -# except Exception as get_exc: -# logger.info(f"Error normalizing {value} at {url}") -# logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") -# return {} -# try: -# normalized = response.json() -# except Exception as json_exc: -# logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") -# logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") -# return {} - -# return normalized - -# def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[Identifier]: -# """ Record normalized results. """ -# curie = identifier.id -# normalization = normalized.get(curie, {}) -# if normalization is None: -# logger.info(f"Normalization service did not return normalization for: {curie}") -# return None - -# preferred_id = normalization.get("id", {}) -# equivalent_identifiers = normalization.get("equivalent_identifiers", []) -# biolink_type = normalization.get("type", []) - -# # Return none if there isn't actually a preferred id -# if 'identifier' not in preferred_id: -# logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") -# return None - -# logger.debug(f"Preferred id: {preferred_id}") -# identifier.id = preferred_id.get('identifier', '') -# identifier.label = preferred_id.get('label', '') -# identifier.description = preferred_id.get('description', '') -# identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] -# try: -# identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name -# except: -# # converts biolink:SmallMolecule to small molecule -# identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() -# return identifier - - -# class SynonymFinder(ApiClient[str, List[str]]): - -# def __init__(self, url: str): -# self.url = url - -# def get_synonyms(self, curie: str, http_session): -# ''' -# This function uses the NCATS translator service to return a list of synonyms for -# curie id -# ''' - -# return self(curie, http_session) - -# def make_request(self, curie: str, http_session: Session): -# # Get response from namelookup reverse lookup op -# # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) -# url = f"{self.url}" -# payload = { -# 'curies': [curie] -# } -# try: -# response = http_session.post(url, json=payload) -# if str(response.status_code).startswith('4'): -# logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") -# return {curie: []} -# if str(response.status_code).startswith('5'): -# logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") -# return {curie: []} -# return response.json() -# except json.decoder.JSONDecodeError as e: -# logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") -# return {curie: []} - -# def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: -# # Return curie synonyms -# return raw_synonyms.get(curie, []) - - - - - -# class BioLinkPURLerizer: -# # Static class for the sole purpose of doing lookups of different ontology PURLs -# # Is it pretty? No. But it gets the job done. 
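The removed `SynonymFinder` (its role now played by `DefaultSynonymFinder` in the annotators package) posts a single-CURIE payload to the name-resolution reverse-lookup endpoint; a minimal sketch of that exchange, assuming the response shape the removed code expected:

```python
import requests

def get_synonyms(curie, url="https://name-resolution-sri.renci.org/reverse_lookup"):
    """POST one CURIE to the reverse-lookup service and return its synonyms,
    falling back to an empty list on HTTP or JSON errors."""
    try:
        response = requests.post(url, json={"curies": [curie]}, timeout=30)
        response.raise_for_status()
        return response.json().get(curie, [])
    except (requests.RequestException, ValueError):
        return []

# usage: get_synonyms("MONDO:0005148")
```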
-# biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", -# "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", -# "BIOGRID": "http://identifiers.org/biogrid/", -# "BIOSAMPLE": "http://identifiers.org/biosample/", -# "BSPO": "http://purl.obolibrary.org/obo/BSPO_", -# "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", -# "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", -# "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", -# "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", -# "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", -# "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", -# "CL": "http://purl.obolibrary.org/obo/CL_", -# "CLINVAR": "http://identifiers.org/clinvar/", -# "CLO": "http://purl.obolibrary.org/obo/CLO_", -# "COAR_RESOURCE": "http://purl.org/coar/resource_type/", -# "CPT": "https://www.ama-assn.org/practice-management/cpt/", -# "CTD": "http://translator.ncats.nih.gov/CTD_", -# "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", -# "DBSNP": "http://identifiers.org/dbsnp/", -# "DGIdb": "https://www.dgidb.org/interaction_types", -# "DOID": "http://purl.obolibrary.org/obo/DOID_", -# "DRUGBANK": "http://identifiers.org/drugbank/", -# "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", -# "EC": "http://www.enzyme-database.org/query.php?ec=", -# "ECTO": "http://purl.obolibrary.org/obo/ECTO_", -# "EDAM-DATA": "http://edamontology.org/data_", -# "EDAM-FORMAT": "http://edamontology.org/format_", -# "EDAM-OPERATION": "http://edamontology.org/operation_", -# "EDAM-TOPIC": "http://edamontology.org/topic_", -# "EFO": "http://identifiers.org/efo/", -# "ENSEMBL": "http://identifiers.org/ensembl/", -# "ExO": "http://purl.obolibrary.org/obo/ExO_", -# "FAO": "http://purl.obolibrary.org/obo/FAO_", -# "FB": "http://identifiers.org/fb/", -# "FBcv": "http://purl.obolibrary.org/obo/FBcv_", -# "FlyBase": "http://flybase.org/reports/", -# "GAMMA": "http://translator.renci.org/GAMMA_", -# "GO": "http://purl.obolibrary.org/obo/GO_", -# "GOLD.META": "http://identifiers.org/gold.meta/", -# "GOP": "http://purl.obolibrary.org/obo/go#", -# "GOREL": "http://purl.obolibrary.org/obo/GOREL_", -# "GSID": "https://scholar.google.com/citations?user=", -# "GTEx": "https://www.gtexportal.org/home/gene/", -# "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", -# "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", -# "HGNC": "http://identifiers.org/hgnc/", -# "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", -# "HMDB": "http://identifiers.org/hmdb/", -# "HP": "http://purl.obolibrary.org/obo/HP_", -# "ICD0": "http://translator.ncats.nih.gov/ICD0_", -# "ICD10": "http://translator.ncats.nih.gov/ICD10_", -# "ICD9": "http://translator.ncats.nih.gov/ICD9_", -# "INCHI": "http://identifiers.org/inchi/", -# "INCHIKEY": "http://identifiers.org/inchikey/", -# "INTACT": "http://identifiers.org/intact/", -# "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", -# "KEGG": "http://identifiers.org/kegg/", -# "LOINC": "http://loinc.org/rdf/", -# "MEDDRA": "http://identifiers.org/meddra/", -# "MESH": "http://identifiers.org/mesh/", -# "MGI": "http://identifiers.org/mgi/", -# "MI": "http://purl.obolibrary.org/obo/MI_", -# "MIR": "http://identifiers.org/mir/", -# "MONDO": "http://purl.obolibrary.org/obo/MONDO_", -# "MP": "http://purl.obolibrary.org/obo/MP_", -# "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", -# "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", -# "NCBIGENE": 
"http://identifiers.org/ncbigene/", -# "NCBITaxon": "http://purl.obolibrary.org/obo/NCBITaxon_", -# "NCIT": "http://purl.obolibrary.org/obo/NCIT_", -# "NDDF": "http://purl.bioontology.org/ontology/NDDF/", -# "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", -# "OBAN": "http://purl.org/oban/", -# "OBOREL": "http://purl.obolibrary.org/obo/RO_", -# "OIO": "http://www.geneontology.org/formats/oboInOwl#", -# "OMIM": "http://purl.obolibrary.org/obo/OMIM_", -# "ORCID": "https://orcid.org/", -# "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", -# "ORPHANET": "http://identifiers.org/orphanet/", -# "PANTHER.FAMILY": "http://identifiers.org/panther.family/", -# "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", -# "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", -# "PDQ": "https://www.cancer.gov/publications/pdq#", -# "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", -# "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", -# "PHAROS": "http://pharos.nih.gov", -# "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", -# "PO": "http://purl.obolibrary.org/obo/PO_", -# "POMBASE": "http://identifiers.org/pombase/", -# "PR": "http://purl.obolibrary.org/obo/PR_", -# "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", -# "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", -# "PathWhiz": "http://smpdb.ca/pathways/#", -# "REACT": "http://www.reactome.org/PathwayBrowser/#/", -# "REPODB": "http://apps.chiragjpgroup.org/repoDB/", -# "RGD": "http://identifiers.org/rgd/", -# "RHEA": "http://identifiers.org/rhea/", -# "RNACENTRAL": "http://identifiers.org/rnacentral/", -# "RO": "http://purl.obolibrary.org/obo/RO_", -# "RTXKG1": "http://kg1endpoint.rtx.ai/", -# "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", -# "ResearchID": "https://publons.com/researcher/", -# "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", -# "SGD": "http://identifiers.org/sgd/", -# "SIO": "http://semanticscience.org/resource/SIO_", -# "SMPDB": "http://identifiers.org/smpdb/", -# "SNOMEDCT": "http://identifiers.org/snomedct/", -# "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", -# "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", -# "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", -# "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", -# "UBERON": "http://purl.obolibrary.org/obo/UBERON_", -# "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", -# "UMLS": "http://identifiers.org/umls/", -# "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", -# "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", -# "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", -# "UNII": "http://identifiers.org/unii/", -# "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", -# "UniProtKB": "http://identifiers.org/uniprot/", -# "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", -# "VMC": "https://github.com/ga4gh/vr-spec/", -# "WB": "http://identifiers.org/wb/", -# "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", -# "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", -# "WIKIDATA": "https://www.wikidata.org/wiki/", -# "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", -# "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", -# "WormBase": "https://www.wormbase.org/get?name=", -# "ZFIN": "http://identifiers.org/zfin/", -# "ZP": "http://purl.obolibrary.org/obo/ZP_", -# "alliancegenome": "https://www.alliancegenome.org/", -# 
"biolink": "https://w3id.org/biolink/vocab/", -# "biolinkml": "https://w3id.org/biolink/biolinkml/", -# "chembio": "http://translator.ncats.nih.gov/chembio_", -# "dcterms": "http://purl.org/dc/terms/", -# "dictyBase": "http://dictybase.org/gene/", -# "doi": "https://doi.org/", -# "fabio": "http://purl.org/spar/fabio/", -# "foaf": "http://xmlns.com/foaf/0.1/", -# "foodb.compound": "http://foodb.ca/compounds/", -# "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", -# "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", -# "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", -# "hetio": "http://translator.ncats.nih.gov/hetio_", -# "interpro": "https://www.ebi.ac.uk/interpro/entry/", -# "isbn": "https://www.isbn-international.org/identifier/", -# "isni": "https://isni.org/isni/", -# "issn": "https://portal.issn.org/resource/ISSN/", -# "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", -# "oboformat": "http://www.geneontology.org/formats/oboInOWL#", -# "pav": "http://purl.org/pav/", -# "prov": "http://www.w3.org/ns/prov#", -# "qud": "http://qudt.org/1.1/schema/qudt#", -# "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", -# "rdfs": "http://www.w3.org/2000/01/rdf-schema#", -# "skos": "https://www.w3.org/TR/skos-reference/#", -# "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", -# "xsd": "http://www.w3.org/2001/XMLSchema#", -# "@vocab": "https://w3id.org/biolink/vocab/"} - -# @staticmethod -# def get_curie_purl(curie): -# # Split into prefix and suffix -# suffix = curie.split(":")[1] -# prefix = curie.split(":")[0] - -# # Check to see if the prefix exists in the hash -# if prefix not in BioLinkPURLerizer.biolink_lookup: -# return None - -# return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" \ No newline at end of file diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 1a58c40..60b43df 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -13,22 +13,23 @@ hookimpl = pluggy.HookimplMarker("dug") @hookimpl -def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_monarch_annotator() - annotator_dict["annotator-sapbert"] = build_sapbert_annotator() +def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): + annotator_dict["monarch"] = build_monarch_annotator("monarch", config=config) + annotator_dict["sapbert"] = build_sapbert_annotator("sapbert", config=config) class AnnotatorNotFoundException(Exception): ... 
-def get_annotator(hook, annotator_name) -> Annotator: +def get_annotator(hook, annotator_name, config: Config) -> Annotator: """Get the annotator from all annotators registered via the define_annotators hook""" available_annotators = {} - hook.define_annotators(annotator_dict=available_annotators) + hook.define_annotators(annotator_dict=available_annotators, config=config) annotator = available_annotators.get(annotator_name.lower()) if annotator is not None: + logger.info(f'Annotating with {annotator}') return annotator err_msg = f"Cannot find annotator of type '{annotator_name}'\n" \ @@ -36,21 +37,22 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(): - config = Config.from_env() +def build_monarch_annotator(annotate_type: str, config: Config): + logger.info(f"Building Monarch annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), config=config, + **config.annotator_args[annotate_type] ) - return annotator -def build_sapbert_annotator(): - config = Config.from_env() +def build_sapbert_annotator(annotate_type, config: Config): + logger.info(f"Building Sapbert annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateSapbert( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), + **config.annotator_args[annotate_type] ) return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index c725bff..0589051 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -13,24 +13,34 @@ logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) - class DugIdentifier: - """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data. - \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future. + """Core information about a concept, produced from annotator request + + The Dug Identifier is the core piece of information about a concept that + produced from a request to an annotator based on a some original source of + data. + + \n The information that is being stored is mostly meant to support the + Monarch API but should be adjusted accordingly to suit new Annotators needs + in the future. \n The information that will be needed for all annotators are: \n id: The CURIE identifier \n label: The CURIE identifier \n description: The CURIE identifier - \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + \n When there is another supported Normalizer it will be seperated into a + separate plugin like annotator. 
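Given the builders above, constructing either annotator directly (bypassing the plugin hook) is a one-liner per type; a hedged sketch assuming the builder signatures shown in this diff:

```python
from dug.config import Config
from dug.core.annotators import build_monarch_annotator, build_sapbert_annotator

config = Config.from_env()
# Each builder pulls its own block out of config.annotator_args:
# "monarch" supplies "url"; "sapbert" supplies "classification_url" and "annotator_url".
monarch_annotator = build_monarch_annotator("monarch", config=config)
sapbert_annotator = build_sapbert_annotator("sapbert", config=config)
```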
""" + def __init__(self, id, label, types=None, search_text="", description=""): + "custom init stores parameters to initial values" + self.id = id self.label = label self.description = description if types is None: types = [] self.types = types - self.search_text = [search_text] if search_text else [] + self.search_text = sorted([search_text]) if search_text else [] self.equivalent_identifiers = [] self.synonyms = [] self.purl = "" @@ -40,12 +50,12 @@ def id_type(self): return self.id.split(":")[0] def add_search_text(self, text): - # Add text only if it's unique and if not empty string + "Add text only if it's unique and if not empty string" if text and text not in self.search_text: - self.search_text.append(text) + self.search_text = sorted(self.search_text + [text]) def get_searchable_dict(self): - # Return a version of the identifier compatible with what's in ElasticSearch + "Return version of identifier compatible with what's in ElasticSearch" es_ident = { "id": self.id, "label": self.label, @@ -56,8 +66,10 @@ def get_searchable_dict(self): return es_ident def jsonable(self): + "Output pickleable object (used by utils.complex_handler)" return self.__dict__ + def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -82,9 +94,18 @@ def __call__(self, value: Input, http_session: Session) -> Output: class DefaultNormalizer(): - """ After annotation there must be a Noramlizing step to collasce equivalent concepts into one official concept. This is a needed step for the knowledge graph to map between different concepts. - \n The reason why this class in integrated into the annotators.py is because currently there is only one supported Normalizer through the NCATs Translator API. - \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + """Default concept normalizer class + + After annotation there must be a Normalizing step to collasce equivalent + concepts into one official concept. This is a needed step for the knowledge + graph to map between different concepts. + + The reason why this class in integrated into the annotators.py is because + currently there is only one supported Normalizer through the NCATs + Translator API. + + When there is another supported Normalizer it will be seperated into a + separate plugin like annotator. """ def __init__(self, url): diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 1c67f40..e50e317 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -21,9 +21,10 @@ def __init__( synonym_finder, config, ontology_greenlist=[], + **kwargs ): - self.annotatorUrl = config.annotator['url'] + self.annotatorUrl = kwargs['url'] self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist @@ -42,7 +43,6 @@ def __init__( self.stopwords = stopwords def __call__(self, text, http_session) -> List[DugIdentifier]: - # Preprocess text (debraviate, remove stopwords, etc.) 
text = self.preprocess_text(text) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 7c2fa81..6f2c93a 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -22,9 +22,14 @@ def __init__( normalizer, synonym_finder, ontology_greenlist=[], + **kwargs ): - self.classificationUrl = "https://med-nemo.apps.renci.org/annotate/" - self.annotatorUrl = "https://babel-sapbert.apps.renci.org/annotate/" + self.classificationUrl = kwargs.get('classification_url') + self.annotatorUrl = kwargs.get('annotator_url') + if not self.classificationUrl: + raise TypeError('Classification url needs to be defined for sapbert annotator') + if not self.annotatorUrl: + raise TypeError('Annotator url needs to be defined for sapbert annotator') self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 44d7c98..b39e6a9 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -50,12 +50,12 @@ def __init__(self, cfg: Config, indices=None): cafile=self._cfg.elastic_ca_path ) self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) async def dump_concepts(self, index, query={}, size=None, diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 4dee2a3..ae58355 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -151,6 +151,7 @@ def annotate_element(self, element): # Each identifier then becomes a concept that links elements together + logger.info("Got %d identifiers for %s", len(identifiers) , element.ml_ready_desc) for identifier in identifiers: if identifier.id not in self.concepts: # Create concept for newly seen identifier diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py index 6037f97..0bedab2 100644 --- a/src/dug/core/factory.py +++ b/src/dug/core/factory.py @@ -4,12 +4,6 @@ from requests_cache import CachedSession import dug.core.tranql as tql -# from dug.core.annotate import (DugAnnotator, -# # Annotator, -# Normalizer, -# Preprocessor, -# SynonymFinder, -# ConceptExpander) from dug.core.concept_expander import ConceptExpander from dug.config import Config as DugConfig, TRANQL_SOURCE from dug.core.crawler import Crawler @@ -53,22 +47,6 @@ def build_crawler(self, target, parser: Parser, annotator: Annotator, element_ty return crawler - # def build_annotator(self) -> Annotator: - - # preprocessor = Preprocessor(**self.config.preprocessor) - # annotator = Annotate(**self.config.annotator) - # normalizer = Normalizer(**self.config.normalizer) - # synonym_finder = SynonymFinder(**self.config.synonym_service) - - # annotator = Annotator( - # preprocessor=preprocessor, - # annotator=annotator, - # normalizer=normalizer, - # synonym_finder=synonym_finder - # ) - - # return annotator - def build_tranqlizer(self) -> ConceptExpander: return ConceptExpander(**self.config.concept_expander) diff --git a/src/dug/core/index.py b/src/dug/core/index.py index 93a2d58..0491d06 100644 --- a/src/dug/core/index.py +++ b/src/dug/core/index.py @@ -30,12 +30,12 @@ def __init__(self, cfg: Config, indices=None): ) self.es = Elasticsearch( 
hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password), + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = Elasticsearch( hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) self.replicas = self.get_es_node_count() if self.es.ping(): diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index acfc5bb..f6d3b77 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,6 +29,7 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): + """Output a pickleable object""" return self.__dict__ def get_searchable_dict(self): @@ -55,7 +56,7 @@ def set_search_terms(self): concept.set_search_terms() search_terms.extend(concept.search_terms) search_terms.append(concept.name) - search_terms = list(set(search_terms)) + search_terms = sorted(list(set(search_terms))) self.search_terms = search_terms def set_optional_terms(self): @@ -63,7 +64,7 @@ def set_optional_terms(self): for concept_id, concept in self.concepts.items(): concept.set_optional_terms() optional_terms.extend(concept.optional_terms) - optional_terms = list(set(optional_terms)) + optional_terms = sorted(list(set(optional_terms))) self.optional_terms = optional_terms def __str__(self): @@ -99,15 +100,15 @@ def add_kg_answer(self, answer, query_name): self.kg_answers[answer_id] = answer def clean(self): - self.search_terms = list(set(self.search_terms)) - self.optional_terms = list(set(self.optional_terms)) + self.search_terms = sorted(list(set(self.search_terms))) + self.optional_terms = sorted(list(set(self.optional_terms))) def set_search_terms(self): # Traverse set of identifiers to determine set of search terms search_terms = self.search_terms for ident_id, ident in self.identifiers.items(): search_terms.extend(ident.search_text + ident.synonyms) - self.search_terms = list(set(search_terms)) + self.search_terms = sorted(list(set(search_terms))) def set_optional_terms(self): # Traverse set of knowledge graph answers to determine set of optional search terms @@ -115,7 +116,7 @@ def set_optional_terms(self): for kg_id, kg_answer in self.kg_answers.items(): optional_terms += kg_answer.get_node_names() optional_terms += kg_answer.get_node_synonyms() - self.optional_terms = list(set(optional_terms)) + self.optional_terms = sorted(list(set(optional_terms))) def get_searchable_dict(self): # Translate DugConcept into Elastic-Compatible Concept @@ -132,6 +133,7 @@ def get_searchable_dict(self): return es_conc def jsonable(self): + """Output a pickleable object""" return self.__dict__ def __str__(self): @@ -142,4 +144,4 @@ def __str__(self): Parser = Callable[[Any], Iterable[Indexable]] -FileParser = Callable[[InputFile], Iterable[Indexable]] +FileParser = Callable[[InputFile], Iterable[Indexable]] \ No newline at end of file diff --git a/src/dug/hookspecs.py b/src/dug/hookspecs.py index 96b984b..9687b15 100644 --- a/src/dug/hookspecs.py +++ b/src/dug/hookspecs.py @@ -4,6 +4,7 @@ from dug.core.parsers import Parser from dug.core.annotators import Annotator +from dug.config import Config hookspec = pluggy.HookspecMarker("dug") @@ -15,7 +16,7 @@ def define_parsers(parser_dict: Dict[str, Parser]): ... 
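The `http_auth` → `basic_auth` switches in `index.py` and `async_search.py` follow the elasticsearch-py 8.x client, where `http_auth` is deprecated; a minimal sketch of the updated client construction (host and credentials are placeholders):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch(
    hosts=["http://localhost:9200"],        # placeholder host
    basic_auth=("elastic", "changeme"),     # previously http_auth=(...)
)
print(es.ping())
```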
@hookspec -def define_annotators(annotator_dict: Dict[str, Annotator]): +def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): """Defines what Annotators are available to Dug """ ... diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7bc0bcf..50f5787 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Dict -import pytest +import pytest_asyncio TEST_DATA_DIR = Path(__file__).parent.resolve() / "data" @@ -45,7 +45,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def monarch_annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -94,7 +94,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def token_classifier_api(): return MockApiService( urls={ @@ -118,11 +118,11 @@ def token_classifier_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def sapbert_annotator_api(): return MockApiService( urls={ - "https://babel-sapbert.apps.renci.org/annotate/": [ + "https://med-nemo.apps.renci.org/annotate/": [ json.dumps( [ { @@ -145,7 +145,7 @@ def sapbert_annotator_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -188,7 +188,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def null_normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -211,7 +211,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -234,7 +234,7 @@ def synonym_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def null_synonym_api(): return MockApiService( urls={"http://synonyms.api": [json.dumps({"XAO:0000336": {"names":[]}}), 200]} diff --git a/tests/integration/mocks/mock_config.py b/tests/integration/mocks/mock_config.py index 27ca191..82bcd1b 100644 --- a/tests/integration/mocks/mock_config.py +++ b/tests/integration/mocks/mock_config.py @@ -12,10 +12,21 @@ class MockConfig: "stopwords": ["the"] }) + # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://med-nemo.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index 8004d0d..eecfd1e 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -28,7 +28,7 @@ def test_monarch_annotation_full( synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg, **cfg.annotator_args["monarch"] ) input_text = "heart attack" @@ -95,7 +95,7 @@ def test_sapbert_annotation_full( normalizer = DefaultNormalizer(**cfg.normalizer) synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) - annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder) + 
annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["sapbert"]) input_text = "Have you ever had a heart attack?" # Fetch Classifiers diff --git a/tests/integration/test_index.py b/tests/integration/test_index.py index 31d0d3d..829e4ba 100644 --- a/tests/integration/test_index.py +++ b/tests/integration/test_index.py @@ -21,7 +21,7 @@ def is_elastic_up(): try: es = Elasticsearch( hosts=hosts, - http_auth=(username, password) + basic_auth=(username, password) ) return es.ping() except Exception: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f40d4f6..87f2edc 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -3,8 +3,7 @@ from dataclasses import dataclass from typing import Dict -import pytest - +import pytest_asyncio @dataclass class MockResponse: @@ -41,7 +40,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -150,7 +149,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -193,7 +192,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -216,7 +215,7 @@ def synonym_api(): ) -@pytest.fixture() +@pytest_asyncio.fixture() def ontology_api(): base_url = "http://ontology.api/?curie={curie}" diff --git a/tests/unit/mocks/data/mock_config.py b/tests/unit/mocks/data/mock_config.py index 27ca191..d70f8a3 100644 --- a/tests/unit/mocks/data/mock_config.py +++ b/tests/unit/mocks/data/mock_config.py @@ -13,9 +13,19 @@ class MockConfig: }) # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "http://classifier.api/annotate/", + "annotator_url": "http://entity-link.api/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index c1702ee..830a140 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg , **cfg.annotator_args["monarch"] ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py index e55b688..cd35ba3 100644 --- a/tests/unit/test_api.py +++ b/tests/unit/test_api.py @@ -6,24 +6,25 @@ import pytest pytest.skip("skipping as dug.api is no longer present", allow_module_level=True) from pytest import mark +import pytest_asyncio from dug.api import app, main, DugResource -@pytest.fixture +@pytest_asyncio.fixture def dug_api_test_client(): with app.test_client() as client: yield client -@pytest.fixture +@pytest_asyncio.fixture def mock_g_object(): with patch('dug.api.dug') as g: yield g -@pytest.fixture +@pytest_asyncio.fixture def 
mock_search_concepts(mock_g_object): mock_g_object().search_concepts.return_value = {'hits': {'hits': [ {'_type': '_doc', @@ -38,21 +39,21 @@ def mock_search_concepts(mock_g_object): }} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_kg(mock_g_object): mock_g_object().search_kg.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_variables(mock_g_object): mock_g_object().search_variables.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_agg_data_types(mock_g_object): mock_g_object().agg_data_type.return_value = ["DBGaP"] diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index b7edc83..db7ed75 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -3,6 +3,7 @@ from unittest.mock import patch import pytest +import pytest_asyncio from dug.core.index import Index, SearchException from dug.config import Config @@ -95,7 +96,7 @@ def search(self, index, body, **kwargs): return {"results": {k: v for k, v in values.items() if body in v}} -@pytest.fixture +@pytest_asyncio.fixture def elastic(): with patch("dug.core.index.Elasticsearch") as es_class: es_instance = MockElastic(indices=MockIndices())
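The fixture changes above, together with the new `pytest-asyncio` entry in `requirements.txt`, let these fixtures feed async tests; a minimal sketch of the pattern with hypothetical names:

```python
import pytest
import pytest_asyncio

@pytest_asyncio.fixture
async def search_client():
    # Hypothetical stand-in for the mocked search service used in these tests.
    return {"status": "ok"}

@pytest.mark.asyncio
async def test_search_client(search_client):
    assert search_client["status"] == "ok"
```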