.DEFAULT_GOAL := help

PRIVATE_PROJECT_BUCKET := $(PROJECTS_BUCKET)/$(PROJECT_NAME)
PUBLIC_PROJECT_BUCKET := datalabs-public/$(PROJECT_NAME)

PACKAGE_VERSION := $(shell venv/bin/python -c "from grants_tagger.__version__ import __version__; print(__version__)")

MESH_MODEL_PACKAGE := xlinear-$(PACKAGE_VERSION).tar.gz
MESH_MODEL := xlinear/model/
MESH_LABEL_BINARIZER := xlinear/label_binarizer.pkl
SCIENCE_TFIDF_SVM_MODEL := tfidf-svm.pkl
SCIENCE_SCIBERT_MODEL := scibert
SCIENCE_LABEL_BINARIZER := label_binarizer.pkl

PYTHON := python3.8
VIRTUALENV := venv
PIP := $(VIRTUALENV)/bin/pip
UNAME := $(shell uname)

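# Note: PROJECTS_BUCKET, PROJECT_NAME, AWS_ACCOUNT_ID and ECR_IMAGE are referenced
# below but never defined here, so the assumption is that they come from the
# environment, e.g. (illustrative values only):
#
#   export PROJECTS_BUCKET=datalabs-projects PROJECT_NAME=grants_tagger
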
.PHONY: sync_data
sync_data: sync_science_data sync_mesh_data ## Sync data to and from s3

.PHONY: sync_science_data
sync_science_data: ## Sync science data to and from s3
	aws s3 sync data/raw/ s3://$(PRIVATE_PROJECT_BUCKET)/data/raw/ --exclude "*allMeSH*" --exclude "*desc*" --exclude "*disease_tags*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/data/raw data/raw --exclude "*allMeSH*" --exclude "*desc*" --exclude "*disease_tags*"

.PHONY: sync_mesh_data
sync_mesh_data: ## Sync mesh data to and from s3
	aws s3 sync data/raw/ s3://$(PRIVATE_PROJECT_BUCKET)/data/raw/ --exclude "*" --include "*allMeSH*" --include "*desc*" --include "*disease_tags*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/data/raw data/raw/ --exclude "*" --include "*allMeSH*" --include "*desc*" --include "*disease_tags*"

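# Usage sketch (assumes AWS credentials with access to the private bucket):
#
#   make sync_data        # sync science and mesh raw data in both directions
#   make sync_mesh_data   # only the allMeSH*, desc* and disease_tags* files
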
.PHONY: sync_artifacts
sync_artifacts: sync_science_artifacts sync_mesh_artifacts ## Sync processed data and models to and from s3

.PHONY: sync_science_artifacts
sync_science_artifacts: ## Sync science processed data and models
	echo "Sync processed data"
	aws s3 sync data/processed s3://$(PRIVATE_PROJECT_BUCKET)/data/processed --exclude "*" --include "*science*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/data/processed data/processed --exclude "*" --include "*science*"
	echo "Sync models"
	aws s3 sync models/ s3://$(PRIVATE_PROJECT_BUCKET)/models/ --exclude "*" --include "*science*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/models/ models/ --exclude "*" --include "*science*"

.PHONY: sync_mesh_artifacts
sync_mesh_artifacts: ## Sync mesh processed data and models
	echo "Sync processed data"
	aws s3 sync data/processed s3://$(PRIVATE_PROJECT_BUCKET)/data/processed --exclude "*" --include "*mesh*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/data/processed data/processed --exclude "*" --include "*mesh*"
	echo "Sync models"
	aws s3 sync models/ s3://$(PRIVATE_PROJECT_BUCKET)/models/ --exclude "*" --include "*mesh*"
	aws s3 sync s3://$(PRIVATE_PROJECT_BUCKET)/models/ models/ --exclude "*" --include "*mesh*" --exclude "*tfidf*"

virtualenv: ## Creates virtualenv
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON) $(VIRTUALENV)
	$(PIP) install --upgrade pip
	$(PIP) install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
	$(PIP) install --no-deps -e .
	$(PIP) install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

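# First-time setup sketch (assumes python3.8 and virtualenv are on PATH):
#
#   make virtualenv && source venv/bin/activate
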
update-requirements: VIRTUALENV := /tmp/update-requirements-venv/
update-requirements: ## Updates requirements.txt from unpinned_requirements.txt
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON) $(VIRTUALENV)
	$(VIRTUALENV)/bin/pip install --upgrade pip
	$(VIRTUALENV)/bin/pip install -r unpinned_requirements.txt
	$(VIRTUALENV)/bin/pip install torch --index-url https://download.pytorch.org/whl/cpu
	echo "#Created by Makefile. Do not edit." > requirements.txt
	$(VIRTUALENV)/bin/pip freeze | grep -v pkg_resources==0.0.0 >> requirements.txt

virtualenv-dev: ## Creates development virtualenv
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON) $(VIRTUALENV)
	$(PIP) install --upgrade pip
	$(PIP) install -r dev_requirements.txt
	$(PIP) install -e .[dev]
	$(PIP) install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
	$(VIRTUALENV)/bin/pre-commit install --hook-type pre-push --hook-type post-checkout --hook-type pre-commit

update-requirements-dev: VIRTUALENV := /tmp/update-requirements-venv/
update-requirements-dev: ## Updates dev_requirements.txt from the unpinned requirements
	@if [ -d $(VIRTUALENV) ]; then rm -rf $(VIRTUALENV); fi
	@mkdir -p $(VIRTUALENV)
	virtualenv --python $(PYTHON) $(VIRTUALENV)
	$(VIRTUALENV)/bin/pip install --upgrade pip
	$(VIRTUALENV)/bin/pip install -r unpinned_requirements.txt -r unpinned_dev_requirements.txt
	echo "#Created by Makefile. Do not edit." > dev_requirements.txt
	$(VIRTUALENV)/bin/pip freeze | grep -v pkg_resources==0.0.0 | grep -v "nvidia-*" >> dev_requirements.txt
	echo "git+https://github.com/nsorros/shap.git@dev" >> dev_requirements.txt

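# Requirements workflow sketch: edit the unpinned files, then regenerate the pins:
#
#   make update-requirements update-requirements-dev
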
.PHONY: test
test: ## Run inference time tests
	$(VIRTUALENV)/bin/pip install pytest pytest-cov
	$(VIRTUALENV)/bin/pytest -m inference_time --disable-warnings -v --cov=grants_tagger
# $(VIRTUALENV)/bin/tox  # tox is not needed, as it would only repeat the tests

.PHONY: test-dev
test-dev: ## Run all tests
	$(VIRTUALENV)/bin/pytest --disable-warnings -v --cov=grants_tagger

.PHONY: build
build: ## Create wheel distribution and mesh model package
	$(VIRTUALENV)/bin/python setup.py bdist_wheel
	tar -c -z -v -f models/$(MESH_MODEL_PACKAGE) models/$(MESH_MODEL) models/$(MESH_LABEL_BINARIZER)

.PHONY: deploy
deploy: ## Deploy wheel to PyPI and public s3 bucket, and models to s3
	aws s3 cp --recursive --exclude "*" --include "*.whl" --acl public-read dist/ s3://$(PUBLIC_PROJECT_BUCKET)
	echo "Deploying $(PACKAGE_VERSION)"
	git tag v$(shell python setup.py --version)
	git push --tags
	$(VIRTUALENV)/bin/python -m twine upload --repository pypi --verbose dist/*
	aws s3 cp --acl public-read models/$(MESH_MODEL_PACKAGE) s3://datalabs-public/grants_tagger/models/$(MESH_MODEL_PACKAGE)
	aws s3 cp --recursive models/$(MESH_MODEL) s3://datalabs-data/grants_tagger/$(MESH_MODEL)
	aws s3 cp models/$(MESH_LABEL_BINARIZER) s3://datalabs-data/grants_tagger/models/$(MESH_LABEL_BINARIZER)
	aws s3 cp models/$(SCIENCE_TFIDF_SVM_MODEL) s3://datalabs-data/grants_tagger/$(SCIENCE_TFIDF_SVM_MODEL)
	aws s3 cp --recursive models/$(SCIENCE_SCIBERT_MODEL) s3://datalabs-data/grants_tagger/$(SCIENCE_SCIBERT_MODEL)
	aws s3 cp models/$(SCIENCE_LABEL_BINARIZER) s3://datalabs-data/grants_tagger/$(SCIENCE_LABEL_BINARIZER)
# The XLinear model is >2GB, which is more than GitHub release assets accept
# gh release upload v$(shell python setup.py --version) models/$(MESH_MODEL).tar.gz

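# Release sketch (assumes AWS, PyPI and git push credentials are all configured):
#
#   make build && make deploy
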
.PHONY: clean
clean: ## Clean hidden and compiled files
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete
	find . -type f -name "*flymake*" -delete
	find . -type f -name "#*#" -delete

.PHONY: aws-docker-login
aws-docker-login: ## Log docker in to ECR
	aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin $(AWS_ACCOUNT_ID).dkr.ecr.eu-west-1.amazonaws.com

.PHONY: build-docker
build-docker: ## Builds Docker container with grants_tagger
	docker build -t $(ECR_IMAGE):latest -f Dockerfile .

.PHONY: push-docker
push-docker: aws-docker-login ## Pushes Docker container to ECR
	docker push $(ECR_IMAGE):latest

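# Example (assumes ECR_IMAGE points at an ECR repository in eu-west-1; the
# repository name is an illustrative guess):
#
#   ECR_IMAGE=$(AWS_ACCOUNT_ID).dkr.ecr.eu-west-1.amazonaws.com/grants_tagger make build-docker push-docker
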
.PHONY: build-streamlit-docker
build-streamlit-docker: ## Builds Docker with streamlit and models
	# aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/disease_mesh_cnn-2021.03.1/ models/disease_mesh_cnn-2021.03.1/
	# aws s3 cp s3://datalabs-data/grants_tagger/models/disease_mesh_label_binarizer.pkl models/
	# aws s3 cp s3://datalabs-data/grants_tagger/models/tfidf-svm-2020.05.2.pkl models/
	# aws s3 cp --recursive s3://datalabs-data/grants_tagger/models/scibert-2020.05.5/ models/
	# aws s3 cp s3://datalabs-data/grants_tagger/models/label_binarizer.pkl models/
	aws s3 sync s3://datalabs-data/grants_tagger/models/xlinear-0.2.3/ models/xlinear-0.2.3/
	docker build -t streamlitapp -f Dockerfile.streamlit .

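# To try the app locally afterwards (sketch; 8501 is streamlit's default port):
#
#   docker run -p 8501:8501 streamlitapp
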
.PHONY: install-private-requirements
install-private-requirements: ## Install the private datascience utils
	pip install httpx pyodbc psycopg2
	pip install -e git+ssh://git@github.com/wellcometrust/datascience.git#egg=wellcome-datascience-common

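# `help` builds the task listing from the `## comment` annotations on the targets
# above: it greps MAKEFILE_LIST for `##` markers, splits each match on `:`, and
# prints the target name and its description in two columns.
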
help: ## Show help message
	@IFS=$$'\n' ; \
	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
	printf "%s\n\n" "Usage: make [task]"; \
	printf "%-20s %s\n" "task" "help" ; \
	printf "%-20s %s\n" "------" "----" ; \
	for help_line in $${help_lines[@]}; do \
		IFS=$$':' ; \
		help_split=($$help_line) ; \
		help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		printf '\033[36m'; \
		printf "%-20s %s" $$help_command ; \
		printf '\033[0m'; \
		printf "%s\n" $$help_info; \
	done