wellcometrust · nsorros · Jul 6, 2021 · Jun 28, 2021 · Jun 30, 2021 · Jul 1, 2021
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,6 @@ venv/**
 
 # Logs
 logs/
+
+# Notebooks
+notebooks/
diff --git a/Dockerfile.streamlit b/Dockerfile.streamlit
@@ -0,0 +1,26 @@
+# Image serving the grants_tagger streamlit app together with its models.
+FROM python:3.8
+
+WORKDIR /code
+
+# Install the pinned dependencies before copying the source so this layer
+# is rebuilt only when requirements.txt changes, not on every code edit.
+COPY requirements.txt /code
+RUN pip install --no-cache-dir --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install the package itself; its dependencies are already installed above.
+COPY grants_tagger/ /code/grants_tagger
+COPY setup.py /code
+RUN pip install --no-cache-dir . --no-dependencies
+
+# The models must already be present under models/ in the build context
+# (`make build-streamlit-docker` downloads them before building).
+COPY models/disease_mesh_cnn-2021.03.1/ models/disease_mesh_cnn-2021.03.1/
+COPY models/tfidf-svm-2020.05.2.pkl models/
+COPY models/scibert-2020.05.5/ models/scibert-2020.05.5/
+COPY models/label_binarizer.pkl models/
+COPY models/disease_mesh_label_binarizer.pkl models/
+
+# The app refers to the models by paths relative to the working directory.
+CMD ["streamlit", "run", "grants_tagger/streamlit_visualize.py"]
diff --git a/Makefile b/Makefile
@@ -61,8 +61,8 @@ update-requirements: ## Updates requirement
 	$(VIRTUALENV)/bin/pip install -r unpinned_requirements.txt
 	$(VIRTUALENV)/bin/pip install -r unpinned_test_requirements.txt
 	echo "#Created by Makefile. Do not edit." > requirements.txt
-	$(VIRTUALENV)/bin/pip freeze | grep -v pkg-resources==0.0.0 | grep -v wellcomeml >> requirements.txt
-	echo "-e git://github.com/wellcometrust/WellcomeML.git@149e6dc8e4fc4a0a6fc9006ca568e99e010ecef0#egg=wellcomeml" >> requirements.txt
+	$(VIRTUALENV)/bin/pip freeze | grep -v pkg-resources==0.0.0 >> requirements.txt
+	#echo "-e git://github.com/wellcometrust/WellcomeML.git@4e96150ff98ccbb3a12e137771fab362c02fa7f1#egg=wellcomeml" >> requirements.txt
 
 .PHONY: test
 test: ## Run tests
@@ -99,6 +99,15 @@ build-docker: ## Builds Docker container with grants_tagger
 push-docker: aws-docker-login ## Pushes Docker container to ECR
 	docker push $(ECR_IMAGE):latest
 
+.PHONY: build-streamlit-docker
+build-streamlit-docker: ## Downloads models from S3 and builds the streamlit Docker image
+	aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/disease_mesh_cnn-2021.03.1/ models/disease_mesh_cnn-2021.03.1/
+	aws s3 cp s3://datalabs-data/grants_tagger/models/disease_mesh_label_binarizer.pkl models/
+	aws s3 cp s3://datalabs-data/grants_tagger/models/tfidf-svm-2020.05.2.pkl models/
+	aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/scibert-2020.05.5/ models/scibert-2020.05.5/
+	aws s3 cp s3://datalabs-data/grants_tagger/models/label_binarizer.pkl models/
+	docker build -t streamlitapp -f Dockerfile.streamlit .
+
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

diff --git a/README.md b/README.md
@@ -42,17 +42,18 @@ Grants tagger comes with a nice CLI with the following commands
 
 ## ⌨️  Commands
 
-| Commands     |                                                              |
-| ------------ | ------------------------------------------------------------ |
+| Commands        |                                                              |
+| --------------- | ------------------------------------------------------------ |
 | ⚙️  preprocess   | preprocess data to use for training                          |
 | 🔥 train        | trains a new model                                           |
 | 📈 evaluate     | evaluate performance of pretrained model                     |
 | 🔮 predict      | predict tags given a grant abstract using a pretrained model |
 | 🔖 tag          | tag grants using a pretrained model                          |
 | 🎛 tune         | tune params and threshold                                    |
 | 📚 pretrain     | pretrains embeddings or language model using unlabeled data  |
-| ⬇️ [download]   | download trained models and data from EPMC                   |
+| ⬇️ [download]    | download trained models and data from EPMC                   |
 | 🔍 [explain]    | importance of feature be it words or tfidf numbers           |
+| 🌐 visualize    | creates a streamlit app to interactively tag grants          |
 
 in square brackets the commands that are not implemented yet
 
@@ -388,6 +389,13 @@ This command is under development. The goals is to be able to get
 feature importance scores on either words or features such as tfidf
 values.
 
+### 🌐 Visualize
+
+This command uses Streamlit to launch a web app in which you
+can interactively tag grants while choosing the threshold
+and the model to use. It currently works only with
+Wellcome-trained models.
+
 # 🧑🏻‍💻  Develop
 
 ## 📖 Data

diff --git a/grants_tagger/__main__.py b/grants_tagger/__main__.py
@@ -1,6 +1,7 @@
 from typing import List, Optional
 from pathlib import Path
 import configparser
+import subprocess
 import logging
 import tarfile
 import tempfile
@@ -369,6 +370,10 @@ def explain():
     # feature importance for models
     pass
 
+@app.command()
+def visualize():
+    st_app_path = os.path.join(os.path.dirname(__file__), "streamlit_visualize.py")
+    subprocess.Popen(["streamlit", "run", st_app_path])
 
 if __name__ == "__main__":
     app(prog_name="grants_tagger")
diff --git a/grants_tagger/streamlit_visualize.py b/grants_tagger/streamlit_visualize.py
@@ -0,0 +1,43 @@
+import streamlit as st
+import pandas as pd
+
+from grants_tagger.predict import predict_tags
+
+threshold = st.sidebar.slider("Threshold", min_value=0.0, max_value=1.0, value=0.5)
+text = st.text_area('Grant abstract', 'The cell is...', height=300)
+
+models = {
+    "disease_mesh_cnn-2021.03.1": {
+        "model_path": "models/disease_mesh_cnn-2021.03.1/",
+        "label_binarizer_path": "models/disease_mesh_label_binarizer.pkl",
+        "approach": "mesh-cnn"
+    },
+    "tfidf-svm-2020.05.2": {
+        "model_path": "models/tfidf-svm-2020.05.2.pkl",
+        "label_binarizer_path": "models/label_binarizer.pkl",
+        "approach": "tfidf-svm"
+    },
+    "scibert-2020.05.5": {
+        "model_path": "models/scibert-2020.05.5/",
+        "label_binarizer_path": "models/label_binarizer.pkl",
+        "approach": "scibert"
+    }
+}
+
+model_option = st.sidebar.selectbox("Model", options=list(models.keys()))
+model = models[model_option]
+
+probabilities = st.sidebar.checkbox("Display probabilities")
+
+with st.spinner('Calculating tags...'):
+    tags = predict_tags([text], model["model_path"], model["label_binarizer_path"],
+        model["approach"], probabilities=probabilities, threshold=threshold)
+    tags = tags[0]
+st.success("Done!")
+
+if probabilities:
+    tags = [{"Tag": tag, "Prob": prob} for tag, prob in tags.items() if prob > threshold]
+    st.table(pd.DataFrame(tags))
+else:
+    for tag in tags:
+        st.button(tag)
diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,23 @@
 #Created by Makefile. Do not edit.
 absl-py==0.13.0
+altair==4.1.0
 appdirs==1.4.4
+appnope==0.1.2
 arff==0.9
+argon2-cffi==20.1.0
+astor==0.8.1
 astunparse==1.6.3
+async-generator==1.10
 atpublic==2.3
 attrs==21.2.0
+backcall==0.2.0
+base58==2.1.0
 black==21.6b0
 bleach==3.3.0
+blinker==1.4
 blis==0.7.4
-boto3==1.17.95
-botocore==1.20.95
+boto3==1.17.105
+botocore==1.20.105
 cached-property==1.5.2
 cachetools==4.2.2
 catalogue==2.0.4
@@ -26,7 +34,9 @@ coverage==5.5
 cycler==0.10.0
 cymem==2.0.5
 Cython==0.29.23
+debugpy==1.3.0
 decorator==4.4.2
+defusedxml==0.7.1
 dictdiffer==0.8.1
 dill==0.3.4
 diskcache==5.2.1
@@ -35,6 +45,7 @@ docutils==0.15
 dpath==2.0.1
 dulwich==0.20.23
 dvc==2.1.0
+entrypoints==0.3
 et-xmlfile==1.1.0
 filelock==3.0.12
 flake8==3.9.2
@@ -48,72 +59,99 @@ future==0.18.2
 gast==0.3.3
 gensim==4.0.1
 gitdb==4.0.7
-GitPython==3.1.14
-google-auth==1.31.0
+GitPython==3.1.18
+google-auth==1.32.1
 google-auth-oauthlib==0.4.4
 google-pasta==0.2.0
 grandalf==0.6
 grpcio==1.32.0
 h5py==2.10.0
 idna==2.10
-importlib-metadata==4.5.0
+importlib-metadata==3.10.1
 iniconfig==1.1.1
+ipykernel==6.0.1
+ipython==7.25.0
+ipython-genutils==0.2.0
+ipywidgets==7.6.3
+jedi==0.18.0
 Jinja2==3.0.1
 jmespath==0.10.0
 joblib==1.0.1
-jsonpath-ng==1.5.2
+jsonpath-ng==1.5.3
+jsonschema==3.2.0
+jupyter-client==6.1.12
+jupyter-core==4.7.1
+jupyterlab-pygments==0.1.2
+jupyterlab-widgets==1.0.0
 Keras-Preprocessing==1.1.2
 keyring==23.0.1
 kiwisolver==1.3.1
 llvmlite==0.36.0
-mailchecker==4.0.8
+mailchecker==4.0.10
 Markdown==3.3.4
 MarkupSafe==2.0.1
 matplotlib==3.4.2
+matplotlib-inline==0.1.2
 mccabe==0.6.1
+mistune==0.8.4
 multiprocess==0.70.12.2
 murmurhash==1.0.5
 mypy-extensions==0.4.3
 nanotime==0.5.2
+nbclient==0.5.3
+nbconvert==6.1.0
+nbformat==5.1.3
 nervaluate==0.1.8
+nest-asyncio==1.5.1
 networkx==2.5.1
 nltk==3.6.2
 nmslib==2.1.1
+notebook==6.4.0
 numba==0.53.1
 numpy==1.19.2
 oauthlib==3.1.1
 openpyxl==3.0.7
 opt-einsum==3.3.0
-packaging==20.9
-pandas==1.2.4
+packaging==21.0
+pandas==1.3.0
+pandocfilters==1.4.3
+parso==0.8.2
 pathos==0.2.8
 pathspec==0.8.1
-pathy==0.5.2
-phonenumbers==8.12.25
-Pillow==8.2.0
+pathy==0.6.0
+pexpect==4.8.0
+phonenumbers==8.12.26
+pickleshare==0.7.5
+Pillow==8.3.0
 pkginfo==1.7.0
 pluggy==0.13.1
 ply==3.11
 pox==0.3.0
 ppft==1.6.6.4
 preshed==3.0.5
+prometheus-client==0.11.0
+prompt-toolkit==3.0.19
 protobuf==3.17.3
 protobuf3-to-dict==0.1.5
 psutil==5.8.0
+ptyprocess==0.7.0
 py==1.10.0
+pyarrow==4.0.1
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pybind11==2.6.1
 pycodestyle==2.7.0
 pycparser==2.20
 pydantic==1.7.4
+pydeck==0.6.2
 pydot==1.4.2
 pyflakes==2.3.1
-pygit2==1.6.0
+pygit2==1.6.1
 Pygments==2.9.0
 pygtrie==2.3.2
-pynndescent==0.5.2
+pynndescent==0.5.3
 pyparsing==2.4.7
+pyrsistent==0.18.0
 pysbd==0.3.4
 pytest==6.2.4
 pytest-cov==2.12.1
@@ -123,33 +161,36 @@ python-fsutil==0.5.0
 python-slugify==5.0.2
 pytz==2021.1
 PyYAML==5.4.1
+pyzmq==22.1.0
 readme-renderer==29.0
-regex==2021.4.4
+regex==2021.7.6
 requests==2.25.1
 requests-oauthlib==1.3.0
 requests-toolbelt==0.9.1
 rfc3986==1.5.0
-rich==10.3.0
+rich==10.5.0
 rsa==4.7.2
-ruamel.yaml==0.17.9
-ruamel.yaml.clib==0.2.2
+ruamel.yaml==0.17.10
+ruamel.yaml.clib==0.2.6
 s3transfer==0.4.2
 sacremoses==0.0.45
-sagemaker==2.46.0
+sagemaker==2.47.2.post0
 scikit-learn==0.23.2
 scikit-multilearn==0.2.0
 scipy==1.4.1
 scispacy==0.4.0
+Send2Trash==1.7.1
 shortuuid==1.0.1
-shtab==1.3.6
+shtab==1.3.7
 six==1.15.0
-smart-open==3.0.0
+smart-open==5.1.0
 smdebug-rulesconfig==1.0.1
 smmap==4.0.0
-spacy==3.0.0
-spacy-legacy==3.0.6
+spacy==3.0.6
+spacy-legacy==3.0.7
 spacy-lookups-data==1.0.2
 srsly==2.4.1
+streamlit==0.84.0
 tabulate==0.8.9
 tensorboard==2.5.0
 tensorboard-data-server==0.6.1
@@ -158,29 +199,37 @@ tensorflow==2.4.0
 tensorflow-addons==0.13.0
 tensorflow-estimator==2.4.0
 termcolor==1.1.0
+terminado==0.10.1
+testpath==0.5.0
 text-unidecode==1.3
-thinc==8.0.5
+thinc==8.0.7
 threadpoolctl==2.1.0
 tokenizers==0.10.1
 toml==0.10.2
+toolz==0.11.1
 torch==1.9.0
-tqdm==4.61.1
+tornado==6.1
+tqdm==4.61.2
+traitlets==5.0.5
 transformers==4.3.0
 twine==3.4.1
 typed-ast==1.4.3
 typeguard==2.12.1
 typer==0.3.2
 typing-extensions==3.7.4.3
+tzlocal==2.1
 umap-learn==0.5.1
-urllib3==1.26.5
+urllib3==1.26.6
+validators==0.18.2
 voluptuous==0.12.1
 wasabi==0.8.2
 wcwidth==0.2.5
 webencodings==0.5.1
+wellcomeml==1.1.0
 Werkzeug==2.0.1
+widgetsnbextension==3.5.1
 wrapt==1.12.1
 xlrd==2.0.1
 xmltodict==0.12.0
 zc.lockfile==2.0
-zipp==3.4.1
--e git://github.com/wellcometrust/WellcomeML.git@149e6dc8e4fc4a0a6fc9006ca568e99e010ecef0#egg=wellcomeml
+zipp==3.5.0
-Original file line number
+Diff line change
@@ Expand Up / @@ -29,3 +29,6 @@ venv/** @@
     # Logs
     logs/
+    # Notebooks
+    notebooks/