Skip to content

Commit

Permalink
Merge pull request #74 from wellcometrust/visualize
Browse files Browse the repository at this point in the history
Add grants_tagger visualize
  • Loading branch information
nsorros authored Jul 6, 2021
2 parents 121bac8 + 82ed1ac commit 19f452f
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 35 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,6 @@ venv/**

# Logs
logs/

# Notebooks
notebooks/
19 changes: 19 additions & 0 deletions Dockerfile.streamlit
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Image that serves the grants_tagger Streamlit app together with the
# pretrained models it offers in its model selector.
FROM python:3.8

WORKDIR /code

# Install the pinned third-party dependencies before copying any source so
# that this (slow) layer is reused from the build cache whenever only the
# package code changes. --no-cache-dir keeps pip's download cache out of
# the image.
COPY requirements.txt /code
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Install grants_tagger itself; its dependencies were already installed
# from requirements.txt above, hence --no-dependencies.
COPY setup.py /code
COPY grants_tagger/ /code/grants_tagger
RUN pip install --no-cache-dir . --no-dependencies

# Model artifacts referenced by relative path from
# grants_tagger/streamlit_visualize.py (resolved against WORKDIR).
COPY models/disease_mesh_cnn-2021.03.1/ models/disease_mesh_cnn-2021.03.1/
COPY models/tfidf-svm-2020.05.2.pkl models/
COPY models/scibert-2020.05.5/ models/scibert-2020.05.5/
COPY models/label_binarizer.pkl models/
COPY models/disease_mesh_label_binarizer.pkl models/

CMD ["streamlit", "run", "grants_tagger/streamlit_visualize.py"]
13 changes: 11 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ update-requirements: ## Updates requirement
$(VIRTUALENV)/bin/pip install -r unpinned_requirements.txt
$(VIRTUALENV)/bin/pip install -r unpinned_test_requirements.txt
echo "#Created by Makefile. Do not edit." > requirements.txt
$(VIRTUALENV)/bin/pip freeze | grep -v pkg-resources==0.0.0 | grep -v wellcomeml >> requirements.txt
echo "-e git://github.com/wellcometrust/WellcomeML.git@149e6dc8e4fc4a0a6fc9006ca568e99e010ecef0#egg=wellcomeml" >> requirements.txt
$(VIRTUALENV)/bin/pip freeze | grep -v pkg-resources==0.0.0 >> requirements.txt
#echo "-e git://github.com/wellcometrust/WellcomeML.git@4e96150ff98ccbb3a12e137771fab362c02fa7f1#egg=wellcomeml" >> requirements.txt

.PHONY: test
test: ## Run tests
Expand Down Expand Up @@ -99,6 +99,15 @@ build-docker: ## Builds Docker container with grants_tagger
# Pushes the image tagged $(ECR_IMAGE):latest; the aws-docker-login
# prerequisite is run first.
push-docker: aws-docker-login ## Pushes Docker container to ECR
docker push $(ECR_IMAGE):latest

# Downloads every model artifact that Dockerfile.streamlit COPYs into the
# image, then builds the image. Each local destination must match the
# path used by the corresponding COPY instruction in Dockerfile.streamlit.
# Note: `aws s3 cp --recursive <prefix>/ <dir>/` places the *contents* of
# the prefix into <dir>/, so directory-style models need their own
# directory name repeated in the destination.
.PHONY: build-streamlit-docker
build-streamlit-docker: ## Builds Docker with streamlit and models
	aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/disease_mesh_cnn-2021.03.1/ models/disease_mesh_cnn-2021.03.1/
	aws s3 cp s3://datalabs-data/grants_tagger/models/disease_mesh_label_binarizer.pkl models/
	aws s3 cp s3://datalabs-data/grants_tagger/models/tfidf-svm-2020.05.2.pkl models/
	aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/scibert-2020.05.5/ models/scibert-2020.05.5/
	aws s3 cp s3://datalabs-data/grants_tagger/models/label_binarizer.pkl models/
	docker build -t streamlitapp -f Dockerfile.streamlit .

help: ## Show help message
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
Expand Down
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,18 @@ Grants tagger comes with a nice CLI with the following commands

## ⌨️ Commands

| Commands | |
| ------------ | ------------------------------------------------------------ |
| Commands | |
| --------------- | ------------------------------------------------------------ |
| ⚙️ preprocess | preprocess data to use for training |
| 🔥 train | trains a new model |
| 📈 evaluate | evaluate performance of pretrained model |
| 🔮 predict | predict tags given a grant abstract using a pretrained model |
| 🔖 tag | tag grants using a pretrained model |
| 🎛 tune | tune params and threshold |
| 📚 pretrain | pretrains embeddings or language model using unlabeled data |
| ⬇️ [download] | download trained models and data from EPMC |
| ⬇️ [download] | download trained models and data from EPMC |
| 🔍 [explain] | importance of feature be it words or tfidf numbers |
| 🌐 visualize | creates a streamlit app to interactively tag grants |

Commands in square brackets are not implemented yet.

Expand Down Expand Up @@ -388,6 +389,13 @@ This command is under development. The goal is to be able to get
feature importance scores on either words or features such as tfidf
values.

### 🌐 Visualize

This command uses streamlit to create a web app in which you
can interactively tag grants while choosing the threshold
and the model to use. It currently works only with Wellcome
trained models.

# 🧑🏻‍💻 Develop

## 📖 Data
Expand Down
5 changes: 5 additions & 0 deletions grants_tagger/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Optional
from pathlib import Path
import configparser
import subprocess
import logging
import tarfile
import tempfile
Expand Down Expand Up @@ -369,6 +370,10 @@ def explain():
# feature importance for models
pass

@app.command()
def visualize():
    """Launch the Streamlit app for interactively tagging grants.

    Runs ``streamlit run`` on the ``streamlit_visualize.py`` script located
    next to this module and blocks until the Streamlit process exits.
    """
    st_app_path = os.path.join(os.path.dirname(__file__), "streamlit_visualize.py")
    # Wait for the child rather than using a fire-and-forget Popen: otherwise
    # this command returns immediately and leaves the Streamlit server
    # running as an orphan that is no longer tied to the invoking terminal.
    subprocess.run(["streamlit", "run", st_app_path])

if __name__ == "__main__":
app(prog_name="grants_tagger")
43 changes: 43 additions & 0 deletions grants_tagger/streamlit_visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import streamlit as st
import pandas as pd

from grants_tagger.predict import predict_tags

# Streamlit re-executes this script top to bottom on every interaction.
# Widgets appear in call order, so the order of the st.* calls below is
# deliberate.
threshold = st.sidebar.slider("Threshold", min_value=0.0, max_value=1.0, value=0.5)
text = st.text_area("Grant abstract", "The cell is...", height=300)


def _model_entry(model_path, label_binarizer_path, approach):
    """Bundle the three predict_tags settings that describe one model."""
    return {
        "model_path": model_path,
        "label_binarizer_path": label_binarizer_path,
        "approach": approach,
    }


# Models offered in the sidebar, keyed by the name shown to the user.
# Paths are relative to the directory Streamlit is started from.
models = {
    "disease_mesh_cnn-2021.03.1": _model_entry(
        "models/disease_mesh_cnn-2021.03.1/",
        "models/disease_mesh_label_binarizer.pkl",
        "mesh-cnn",
    ),
    "tfidf-svm-2020.05.2": _model_entry(
        "models/tfidf-svm-2020.05.2.pkl",
        "models/label_binarizer.pkl",
        "tfidf-svm",
    ),
    "scibert-2020.05.5": _model_entry(
        "models/scibert-2020.05.5/",
        "models/label_binarizer.pkl",
        "scibert",
    ),
}

selected_name = st.sidebar.selectbox("Model", options=list(models.keys()))
model_config = models[selected_name]

show_probabilities = st.sidebar.checkbox("Display probabilities")

with st.spinner("Calculating tags..."):
    # predict_tags takes a list of texts; only one abstract is submitted,
    # so only the first result is kept.
    per_text_results = predict_tags(
        [text],
        model_config["model_path"],
        model_config["label_binarizer_path"],
        model_config["approach"],
        probabilities=show_probabilities,
        threshold=threshold,
    )
    tags = per_text_results[0]
st.success("Done!")

if not show_probabilities:
    # Plain mode: one button per predicted tag.
    for tag in tags:
        st.button(tag)
else:
    # Probability mode: tabulate the tags scoring strictly above the threshold.
    rows = []
    for tag, prob in tags.items():
        if prob > threshold:
            rows.append({"Tag": tag, "Prob": prob})
    st.table(pd.DataFrame(rows))
105 changes: 77 additions & 28 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
#Created by Makefile. Do not edit.
absl-py==0.13.0
altair==4.1.0
appdirs==1.4.4
appnope==0.1.2
arff==0.9
argon2-cffi==20.1.0
astor==0.8.1
astunparse==1.6.3
async-generator==1.10
atpublic==2.3
attrs==21.2.0
backcall==0.2.0
base58==2.1.0
black==21.6b0
bleach==3.3.0
blinker==1.4
blis==0.7.4
boto3==1.17.95
botocore==1.20.95
boto3==1.17.105
botocore==1.20.105
cached-property==1.5.2
cachetools==4.2.2
catalogue==2.0.4
Expand All @@ -26,7 +34,9 @@ coverage==5.5
cycler==0.10.0
cymem==2.0.5
Cython==0.29.23
debugpy==1.3.0
decorator==4.4.2
defusedxml==0.7.1
dictdiffer==0.8.1
dill==0.3.4
diskcache==5.2.1
Expand All @@ -35,6 +45,7 @@ docutils==0.15
dpath==2.0.1
dulwich==0.20.23
dvc==2.1.0
entrypoints==0.3
et-xmlfile==1.1.0
filelock==3.0.12
flake8==3.9.2
Expand All @@ -48,72 +59,99 @@ future==0.18.2
gast==0.3.3
gensim==4.0.1
gitdb==4.0.7
GitPython==3.1.14
google-auth==1.31.0
GitPython==3.1.18
google-auth==1.32.1
google-auth-oauthlib==0.4.4
google-pasta==0.2.0
grandalf==0.6
grpcio==1.32.0
h5py==2.10.0
idna==2.10
importlib-metadata==4.5.0
importlib-metadata==3.10.1
iniconfig==1.1.1
ipykernel==6.0.1
ipython==7.25.0
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2==3.0.1
jmespath==0.10.0
joblib==1.0.1
jsonpath-ng==1.5.2
jsonpath-ng==1.5.3
jsonschema==3.2.0
jupyter-client==6.1.12
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
Keras-Preprocessing==1.1.2
keyring==23.0.1
kiwisolver==1.3.1
llvmlite==0.36.0
mailchecker==4.0.8
mailchecker==4.0.10
Markdown==3.3.4
MarkupSafe==2.0.1
matplotlib==3.4.2
matplotlib-inline==0.1.2
mccabe==0.6.1
mistune==0.8.4
multiprocess==0.70.12.2
murmurhash==1.0.5
mypy-extensions==0.4.3
nanotime==0.5.2
nbclient==0.5.3
nbconvert==6.1.0
nbformat==5.1.3
nervaluate==0.1.8
nest-asyncio==1.5.1
networkx==2.5.1
nltk==3.6.2
nmslib==2.1.1
notebook==6.4.0
numba==0.53.1
numpy==1.19.2
oauthlib==3.1.1
openpyxl==3.0.7
opt-einsum==3.3.0
packaging==20.9
pandas==1.2.4
packaging==21.0
pandas==1.3.0
pandocfilters==1.4.3
parso==0.8.2
pathos==0.2.8
pathspec==0.8.1
pathy==0.5.2
phonenumbers==8.12.25
Pillow==8.2.0
pathy==0.6.0
pexpect==4.8.0
phonenumbers==8.12.26
pickleshare==0.7.5
Pillow==8.3.0
pkginfo==1.7.0
pluggy==0.13.1
ply==3.11
pox==0.3.0
ppft==1.6.6.4
preshed==3.0.5
prometheus-client==0.11.0
prompt-toolkit==3.0.19
protobuf==3.17.3
protobuf3-to-dict==0.1.5
psutil==5.8.0
ptyprocess==0.7.0
py==1.10.0
pyarrow==4.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pybind11==2.6.1
pycodestyle==2.7.0
pycparser==2.20
pydantic==1.7.4
pydeck==0.6.2
pydot==1.4.2
pyflakes==2.3.1
pygit2==1.6.0
pygit2==1.6.1
Pygments==2.9.0
pygtrie==2.3.2
pynndescent==0.5.2
pynndescent==0.5.3
pyparsing==2.4.7
pyrsistent==0.18.0
pysbd==0.3.4
pytest==6.2.4
pytest-cov==2.12.1
Expand All @@ -123,33 +161,36 @@ python-fsutil==0.5.0
python-slugify==5.0.2
pytz==2021.1
PyYAML==5.4.1
pyzmq==22.1.0
readme-renderer==29.0
regex==2021.4.4
regex==2021.7.6
requests==2.25.1
requests-oauthlib==1.3.0
requests-toolbelt==0.9.1
rfc3986==1.5.0
rich==10.3.0
rich==10.5.0
rsa==4.7.2
ruamel.yaml==0.17.9
ruamel.yaml.clib==0.2.2
ruamel.yaml==0.17.10
ruamel.yaml.clib==0.2.6
s3transfer==0.4.2
sacremoses==0.0.45
sagemaker==2.46.0
sagemaker==2.47.2.post0
scikit-learn==0.23.2
scikit-multilearn==0.2.0
scipy==1.4.1
scispacy==0.4.0
Send2Trash==1.7.1
shortuuid==1.0.1
shtab==1.3.6
shtab==1.3.7
six==1.15.0
smart-open==3.0.0
smart-open==5.1.0
smdebug-rulesconfig==1.0.1
smmap==4.0.0
spacy==3.0.0
spacy-legacy==3.0.6
spacy==3.0.6
spacy-legacy==3.0.7
spacy-lookups-data==1.0.2
srsly==2.4.1
streamlit==0.84.0
tabulate==0.8.9
tensorboard==2.5.0
tensorboard-data-server==0.6.1
Expand All @@ -158,29 +199,37 @@ tensorflow==2.4.0
tensorflow-addons==0.13.0
tensorflow-estimator==2.4.0
termcolor==1.1.0
terminado==0.10.1
testpath==0.5.0
text-unidecode==1.3
thinc==8.0.5
thinc==8.0.7
threadpoolctl==2.1.0
tokenizers==0.10.1
toml==0.10.2
toolz==0.11.1
torch==1.9.0
tqdm==4.61.1
tornado==6.1
tqdm==4.61.2
traitlets==5.0.5
transformers==4.3.0
twine==3.4.1
typed-ast==1.4.3
typeguard==2.12.1
typer==0.3.2
typing-extensions==3.7.4.3
tzlocal==2.1
umap-learn==0.5.1
urllib3==1.26.5
urllib3==1.26.6
validators==0.18.2
voluptuous==0.12.1
wasabi==0.8.2
wcwidth==0.2.5
webencodings==0.5.1
wellcomeml==1.1.0
Werkzeug==2.0.1
widgetsnbextension==3.5.1
wrapt==1.12.1
xlrd==2.0.1
xmltodict==0.12.0
zc.lockfile==2.0
zipp==3.4.1
-e git://github.com/wellcometrust/WellcomeML.git@149e6dc8e4fc4a0a6fc9006ca568e99e010ecef0#egg=wellcomeml
zipp==3.5.0
Loading

0 comments on commit 19f452f

Please sign in to comment.