Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/165 sapbert annotator #334

Merged
merged 24 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions .github/workflows/code-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.12'

# Currently actions/setup-python supports caching
# but the cache is not as robust as cache action.
Expand Down Expand Up @@ -113,7 +113,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.12'

- name: Install Requirements
run: |
Expand All @@ -123,8 +123,7 @@ jobs:

- name: Test with pytest
run: |
pytest --doctest-modules src
coverage run -m pytest tests/unit
make test

############################ Bandit ################################
bandit:
Expand All @@ -134,7 +133,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: '3.12'

- name: Install Requirements
run: |
Expand All @@ -145,4 +144,4 @@ jobs:
# Only report high security issues
- name: Test with Bandit
run: |
bandit -r src -n3 -lll
bandit -r src -n3 -lll
1 change: 1 addition & 0 deletions .github/workflows/trivy-pr-scan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ jobs:
image-ref: '${{ github.repository }}:vuln-test'
format: 'sarif'
severity: 'CRITICAL,HIGH'
ignore-unfixed: true
output: 'trivy-results.sarif'
exit-code: '1'
# Scan results should be viewable in GitHub Security Dashboard
Expand Down
10 changes: 5 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.10.10-slim
FROM python:3.12.0-alpine3.18

# Install required packages
RUN apt-get update && \
apt-get install -y curl make vim && \
rm -rf /var/cache/apt/*
RUN apk update && \
apk add g++ make

RUN pip install --upgrade pip
# Create a non-root user.
ENV USER dug
ENV HOME /home/$USER
ENV UID 1000

RUN adduser --disabled-login --home $HOME --shell /bin/bash --uid $UID $USER
RUN adduser -D --home $HOME --uid $UID $USER

USER $USER
WORKDIR $HOME
Expand Down
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ install.dug:

#test: Run all tests
test:
# ${PYTHON} -m flake8 src
${PYTHON} -m pytest --doctest-modules src
coverage run -m pytest tests

coverage:
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag"

After crawling, you can search:
```shell
dug search -q "heart attack" -t "concepts"
dug search -q "heart attack" -t "variables" -k "concept=MONDO:0005068"
dug search -q "vein" -t "concepts"
dug search -q "vein" -t "variables" -k "concept=UBERON:0001638"
```

You can also query Dug's REST API:
```shell
query="`echo '{"index" : "concepts_index", "query" : "heart attack"}'`"
query="`echo '{"index" : "concepts_index", "query" : "vein"}'`"

curl --data "$query" \
--header "Content-Type: application/json" \
Expand Down
29 changes: 14 additions & 15 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,30 +1,29 @@
aiohttp
asyncio
fastapi==0.95.0
uvicorn==0.23.2
fastapi
uvicorn
elasticsearch[async]==8.5.2
gunicorn
itsdangerous
Jinja2
jsonschema
MarkupSafe
ormar==0.12.1
mistune==2.0.3
pluggy==1.0.0
pyrsistent==0.17.3
ormar
mistune
pluggy
pyrsistent
pytest
pytz==2021.1
PyYAML==6.0
requests==2.31.0
# old redis==4.4.2
redis==4.5.1
requests-cache==0.9.8
six==1.16.0
pytz
PyYAML
requests
redis
requests-cache
six

# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above uses.
click
httpx>=0.24.1
httpx
linkml-runtime==1.6.0
bmt==1.1.0
urllib3>=1.26.17
urllib3
10 changes: 5 additions & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ classifiers =
package_dir =
= src
packages = find:
python_requires = >=3.10
python_requires = >=3.12
include_package_data = true
install_requires =
elasticsearch==8.5.2
pluggy
requests
requests_cache==0.9.8
redis==4.5.1
requests_cache
redis

[options.entry_points]
console_scripts =
dug = dug.cli:main

[options.extras_require]
rest =
fastapi==0.95.0
uvicorn==0.23.2
fastapi
uvicorn
gunicorn
jsonschema

Expand Down
9 changes: 8 additions & 1 deletion src/dug/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ def get_argparser():
required=True
)

crawl_parser.add_argument(
'-a', '--annotator',
help='Annotator used to annotate identifiers in crawl file',
dest="annotator_type",
default="annotator-monarch"
)

crawl_parser.add_argument(
'-e', '--element-type',
help='[Optional] Coerce all elements to a certain data type (e.g. DbGaP Variable).\n'
Expand Down Expand Up @@ -108,7 +115,7 @@ def crawl(args):
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
dug.crawl(args.target, args.parser_type, args.element_type)
dug.crawl(args.target, args.parser_type, args.annotator_type, args.element_type)


def search(args):
Expand Down
18 changes: 11 additions & 7 deletions src/dug/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@

from dug import hookspecs
from dug.core import parsers
from dug.core import annotators
from dug.core.factory import DugFactory
from dug.core.parsers import DugConcept, Parser, get_parser
from dug.core.annotators import DugIdentifier, Annotator, get_annotator

logger = logging.getLogger('dug')
stdout_log_handler = logging.StreamHandler(sys.stdout)
Expand All @@ -29,6 +31,7 @@ def get_plugin_manager() -> pluggy.PluginManager:
pm.add_hookspecs(hookspecs)
pm.load_setuptools_entrypoints("dug")
pm.register(parsers)
pm.register(annotators)
return pm


Expand Down Expand Up @@ -56,19 +59,20 @@ def __init__(self, factory: DugFactory):
]
)

def crawl(self, target_name: str, parser_type: str, element_type: str = None):
def crawl(self, target_name: str, parser_type: str, annotator_type: str, element_type: str = None):

pm = get_plugin_manager()
parser = get_parser(pm.hook, parser_type)
annotator = get_annotator(pm.hook, annotator_type)
targets = get_targets(target_name)

for target in targets:
self._crawl(target, parser, element_type)
self._crawl(target, parser, annotator, element_type)

def _crawl(self, target: Path, parser: Parser, element_type):
def _crawl(self, target: Path, parser: Parser, annotator: Annotator, element_type):

# Initialize crawler
crawler = self._factory.build_crawler(target, parser, element_type)
crawler = self._factory.build_crawler(target, parser, annotator, element_type)
# Read elements, annotate, and expand using tranql queries
crawler.crawl()

Expand All @@ -93,11 +97,11 @@ def search(self, target, query, **kwargs):
event_loop = asyncio.get_event_loop()
targets = {
'concepts': partial(
self._search.search_concepts, index=kwargs.get('index', self.concepts_index)),
self._search.search_concepts),
'variables': partial(
self._search.search_variables, index=kwargs.get('index', self.variables_index), concept=kwargs.pop('concept', None)),
self._search.search_variables, concept=kwargs.pop('concept', None)),
'kg': partial(
self._search.search_kg, index=kwargs.get('index', self.kg_index), unique_id=kwargs.pop('unique_id', None))
self._search.search_kg, unique_id=kwargs.pop('unique_id', None))
}
kwargs.pop('index', None)
func = targets.get(target)
Expand Down
Loading
Loading