Add support for ONNX AI models
antas-marcin committed Nov 30, 2023
1 parent b3c001b commit 2436eeb
Showing 14 changed files with 200 additions and 91 deletions.
8 changes: 8 additions & 0 deletions .dockerignore
@@ -1,2 +1,10 @@
__pycache__
.github
.venv
.vscode
cicd
models
nltk_data
smoke_test.py
test_app.py
requirements-test.txt
40 changes: 38 additions & 2 deletions .github/workflows/main.yaml
@@ -20,54 +20,90 @@ jobs:
include:
- model_name: distilbert-base-uncased
model_tag_name: distilbert-base-uncased
onnx_runtime: false
- model_name: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
model_tag_name: sentence-transformers-paraphrase-multilingual-MiniLM-L12-v2
onnx_runtime: false
- model_name: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
model_tag_name: sentence-transformers-multi-qa-MiniLM-L6-cos-v1
onnx_runtime: false
- model_name: sentence-transformers/multi-qa-mpnet-base-cos-v1
model_tag_name: sentence-transformers-multi-qa-mpnet-base-cos-v1
onnx_runtime: false
- model_name: sentence-transformers/all-mpnet-base-v2
model_tag_name: sentence-transformers-all-mpnet-base-v2
onnx_runtime: false
- model_name: sentence-transformers/all-MiniLM-L12-v2
model_tag_name: sentence-transformers-all-MiniLM-L12-v2
onnx_runtime: false
- model_name: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
model_tag_name: sentence-transformers-paraphrase-multilingual-mpnet-base-v2
onnx_runtime: false
- model_name: sentence-transformers/all-MiniLM-L6-v2
model_tag_name: sentence-transformers-all-MiniLM-L6-v2
onnx_runtime: false
- model_name: sentence-transformers/multi-qa-distilbert-cos-v1
model_tag_name: sentence-transformers-multi-qa-distilbert-cos-v1
onnx_runtime: false
- model_name: sentence-transformers/gtr-t5-base
model_tag_name: sentence-transformers-gtr-t5-base
onnx_runtime: false
- model_name: sentence-transformers/gtr-t5-large
model_tag_name: sentence-transformers-gtr-t5-large
onnx_runtime: false
- model_name: sentence-transformers/sentence-t5-base
model_tag_name: sentence-transformers-sentence-t5-base
onnx_runtime: false
- model_name: vblagoje/dpr-ctx_encoder-single-lfqa-wiki
model_tag_name: vblagoje-dpr-ctx_encoder-single-lfqa-wiki
onnx_runtime: false
- model_name: vblagoje/dpr-question_encoder-single-lfqa-wiki
model_tag_name: vblagoje-dpr-question_encoder-single-lfqa-wiki
onnx_runtime: false
- model_name: facebook/dpr-ctx_encoder-single-nq-base
model_tag_name: facebook-dpr-ctx_encoder-single-nq-base
onnx_runtime: false
- model_name: facebook/dpr-question_encoder-single-nq-base
model_tag_name: facebook-dpr-question_encoder-single-nq-base
onnx_runtime: false
- model_name: google/flan-t5-base
model_tag_name: google-flan-t5-base
onnx_runtime: false
- model_name: google/flan-t5-large
model_tag_name: google-flan-t5-large
onnx_runtime: false
- model_name: biu-nlp/abstract-sim-sentence
model_tag_name: biu-nlp-abstract-sim-sentence
onnx_runtime: false
- model_name: biu-nlp/abstract-sim-query
model_tag_name: biu-nlp-abstract-sim-query
onnx_runtime: false
- model_name: BAAI/bge-small-en
model_tag_name: baai-bge-small-en
onnx_runtime: true
- model_name: BAAI/bge-small-en-v1.5
model_tag_name: baai-bge-small-en-v1.5
onnx_runtime: true
- model_name: BAAI/bge-base-en
model_tag_name: baai-bge-base-en
onnx_runtime: true
- model_name: BAAI/bge-base-en-v1.5
model_tag_name: baai-bge-base-en-v1.5
onnx_runtime: true
- model_name: sentence-transformers/all-MiniLM-L6-v2
model_tag_name: sentence-transformers-all-MiniLM-L6-v2
onnx_runtime: true
env:
LOCAL_REPO: transformers-inference
REMOTE_REPO: semitechnologies/transformers-inference
MODEL_NAME: ${{matrix.model_name}}
MODEL_TAG_NAME: ${{matrix.model_tag_name}}
ONNX_RUNTIME: ${{matrix.onnx_runtime}}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.11"
cache: 'pip' # caching pip dependencies
- name: Login to Docker Hub
if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
@@ -96,7 +132,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
python-version: "3.11"
- name: Login to Docker Hub
if: ${{ !github.event.pull_request.head.repo.fork }} # no PRs from fork
uses: docker/login-action@v2
5 changes: 4 additions & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.11-slim

WORKDIR /app

@@ -8,7 +8,10 @@ RUN pip install --upgrade pip setuptools
COPY requirements.txt .
RUN pip3 install -r requirements.txt

ARG TARGETARCH
ARG MODEL_NAME
ARG ONNX_RUNTIME
ENV ONNX_CPU=${TARGETARCH}
COPY download.py .
RUN ./download.py

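A note on the new build args: `TARGETARCH` is populated automatically by `docker buildx` for each target platform, and the Dockerfile copies it into `ONNX_CPU` so that `download.py` can pick a quantization config matching the image's architecture. A minimal sketch of the consuming side, using the same environment variable names and defaults as this commit:

```python
import os

# Sketch: how download.py resolves its build-time configuration.
# ONNX_CPU is filled from Docker's TARGETARCH ("amd64" or "arm64");
# it may also be set to "avx512_vnni" explicitly for newer x86_64 CPUs.
onnx_runtime = os.getenv("ONNX_RUNTIME") or "false"  # "true" => export to ONNX
onnx_cpu_arch = os.getenv("ONNX_CPU") or "arm64"     # quantization target
```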
19 changes: 14 additions & 5 deletions app.py
@@ -43,16 +43,25 @@ def startup_event():
if transformers_direct_tokenize is not None and transformers_direct_tokenize == "true" or transformers_direct_tokenize == "1":
direct_tokenize = True

model_dir = "./models/model"
def get_model_directory() -> str:
if os.path.exists("./models/model/model_name"):
with open("./models/model/model_name", "r") as f:
if os.path.exists(f"{model_dir}/model_name"):
with open(f"{model_dir}/model_name", "r") as f:
model_name = f.read()
return f"./models/model/{model_name}"
return "./models/model"
return f"{model_dir}/{model_name}"
return model_dir

def get_onnx_runtime() -> bool:
if os.path.exists(f"{model_dir}/onnx_runtime"):
with open(f"{model_dir}/onnx_runtime", "r") as f:
onnx_runtime = f.read()
return onnx_runtime == "true"
return False

meta_config = Meta(get_model_directory())
vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
meta_config.getModelType(), meta_config.get_architecture(), direct_tokenize)
meta_config.get_model_type(), meta_config.get_architecture(),
direct_tokenize, get_onnx_runtime())


@app.get("/.well-known/live", response_class=Response)
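The startup path now keys off two marker files that `download.py` writes next to the weights: `model_name` (present when the weights live in a nested SentenceTransformer cache folder) and `onnx_runtime` (present when the model was exported to ONNX). A condensed sketch of that sentinel-file pattern as the diff uses it:

```python
import os

model_dir = "./models/model"

def read_marker(name: str) -> str | None:
    # Both startup helpers follow the same pattern: a small marker file
    # written at download time, read once at startup.
    path = f"{model_dir}/{name}"
    if os.path.exists(path):
        with open(path, "r") as f:
            return f.read()
    return None

model_subdir = read_marker("model_name")         # e.g. "BAAI_bge-small-en-v1.5"
use_onnx = read_marker("onnx_runtime") == "true" # picks the ONNX code path
```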
6 changes: 5 additions & 1 deletion cicd/build.sh
@@ -4,5 +4,9 @@ set -eou pipefail

local_repo=${LOCAL_REPO?Variable LOCAL_REPO is required}
model_name=${MODEL_NAME?Variable MODEL_NAME is required}
onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required}

docker build --build-arg "MODEL_NAME=$model_name" -t "$local_repo" .
docker build \
--build-arg "MODEL_NAME=$model_name" \
--build-arg "ONNX_RUNTIME=$onnx_runtime" \
-t "$local_repo" .
21 changes: 10 additions & 11 deletions cicd/docker_push.sh
@@ -2,16 +2,11 @@

set -eou pipefail

# Docker push rules
# If on tag (e.g. 1.0.0)
# - any commit is pushed as :<model>-<semver>
# - any commit is pushed as :<model>-latest
# - any commit is pushed as :<model>
git_hash=
remote_repo=${REMOTE_REPO?Variable REMOTE_REPO is required}
model_name=${MODEL_NAME?Variable MODEL_NAME is required}
docker_username=${DOCKER_USERNAME?Variable DOCKER_USERNAME is required}
docker_password=${DOCKER_PASSWORD?Variable DOCKER_PASSWORD is required}
onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required}
original_model_name=$model_name
git_tag=$GITHUB_REF_NAME

@@ -20,6 +15,7 @@ function main() {
echo "git ref type is $GITHUB_REF_TYPE"
echo "git ref name is $GITHUB_REF_NAME"
echo "git tag is $git_tag"
echo "onnx_runtime is $onnx_runtime"
push_tag
}

@@ -31,22 +27,25 @@ function init() {
model_name="$MODEL_TAG_NAME"
fi

git_hash="$(git rev-parse HEAD | head -c 7)"

docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
docker buildx create --use
echo "$docker_password" | docker login -u "$docker_username" --password-stdin
}

function push_tag() {
if [ ! -z "$git_tag" ] && [ "$GITHUB_REF_TYPE" == "tag" ]; then
tag_git="$remote_repo:$model_name-$git_tag"
tag_latest="$remote_repo:$model_name-latest"
tag="$remote_repo:$model_name"
model_name_part=$model_name
if [ "$onnx_runtime" == "true" ]; then
model_name_part="$model_name-onnx"
fi
tag_git="$remote_repo:$model_name_part-$git_tag"
tag_latest="$remote_repo:$model_name_part-latest"
tag="$remote_repo:$model_name_part"

echo "Tag & Push $tag, $tag_latest, $tag_git"
docker buildx build --platform=linux/arm64,linux/amd64 \
--build-arg "MODEL_NAME=$original_model_name" \
--build-arg "ONNX_RUNTIME=$onnx_runtime" \
--push \
--tag "$tag_git" \
--tag "$tag_latest" \
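The effect of the new `-onnx` suffix is easiest to see in the tags it produces. A small illustrative mirror of `push_tag`'s naming scheme (not code from this commit; the version number in the example is hypothetical):

```python
def image_tags(remote_repo: str, model_tag: str, onnx: bool, git_tag: str) -> list[str]:
    # Mirrors push_tag: ONNX builds get a "-onnx" suffix on the model part.
    name = f"{model_tag}-onnx" if onnx else model_tag
    return [
        f"{remote_repo}:{name}-{git_tag}",
        f"{remote_repo}:{name}-latest",
        f"{remote_repo}:{name}",
    ]

# image_tags("semitechnologies/transformers-inference",
#            "baai-bge-small-en-v1.5", True, "1.9.0") ->
#   [".../transformers-inference:baai-bge-small-en-v1.5-onnx-1.9.0",
#    ".../transformers-inference:baai-bge-small-en-v1.5-onnx-latest",
#    ".../transformers-inference:baai-bge-small-en-v1.5-onnx"]
```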
18 changes: 0 additions & 18 deletions cicd/markdown_table_from_api.py

This file was deleted.

24 changes: 0 additions & 24 deletions cicd/travis_yml_to_markdown_table.py

This file was deleted.

2 changes: 1 addition & 1 deletion custom.Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.11-slim

WORKDIR /app

96 changes: 74 additions & 22 deletions download.py
@@ -9,6 +9,10 @@
AutoConfig,
)
from sentence_transformers import SentenceTransformer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from optimum.onnxruntime.configuration import AutoQuantizationConfig
from optimum.onnxruntime import ORTQuantizer
from pathlib import Path


model_dir = './models/model'
@@ -22,30 +26,78 @@
if force_automodel:
print(f"Using AutoModel for {model_name} to instantiate model")

print(f"Downloading model {model_name} from huggingface model hub")
config = AutoConfig.from_pretrained(model_name)
model_type = config.to_dict()['model_type']
onnx_runtime = os.getenv('ONNX_RUNTIME')
if not onnx_runtime:
onnx_runtime = "false"

if model_type is not None and model_type == "t5":
SentenceTransformer(model_name, cache_folder=model_dir)
with open(f"{model_dir}/model_name", "w") as f:
f.write(model_name.replace("/", "_"))
else:
if config.architectures and not force_automodel:
print(f"Using class {config.architectures[0]} to load model weights")
mod = __import__('transformers', fromlist=[config.architectures[0]])
try:
klass_architecture = getattr(mod, config.architectures[0])
model = klass_architecture.from_pretrained(model_name)
except AttributeError:
print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
model = AutoModel.from_pretrained(model_name)
else:
model = AutoModel.from_pretrained(model_name)
onnx_cpu_arch = os.getenv('ONNX_CPU')
if not onnx_cpu_arch:
onnx_cpu_arch = "arm64"

print(f"Downloading MODEL_NAME={model_name} with FORCE_AUTOMODEL={force_automodel} ONNX_RUNTIME={onnx_runtime} ONNX_CPU={onnx_cpu_arch}")

def download_onnx_model(model_name: str, model_dir: str):
# Download model and tokenizer
onnx_path = Path(model_dir)
ort_model = ORTModelForFeatureExtraction.from_pretrained(model_name, from_transformers=True)
# Save model
ort_model.save_pretrained(onnx_path)

def quantization_config(onnx_cpu_arch: str):
if onnx_cpu_arch.lower() == "avx512_vnni":
print("Quantize Model for x86_64 (amd64) (avx512_vnni)")
return AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
if onnx_cpu_arch.lower() == "arm64":
print(f"Quantize Model for ARM64")
return AutoQuantizationConfig.arm64(is_static=False, per_channel=False)
# default is AMD64
print(f"Quantize Model for x86_64 (amd64) (AVX2)")
return AutoQuantizationConfig.avx2(is_static=False, per_channel=False)

# Quantize the model / convert to ONNX
qconfig = quantization_config(onnx_cpu_arch)
quantizer = ORTQuantizer.from_pretrained(ort_model)
# Apply dynamic quantization on the model
quantizer.quantize(save_dir=onnx_path, quantization_config=qconfig)
# Remove model.onnx file, leave only model_quantized.onnx
if os.path.isfile(f"{model_dir}/model.onnx"):
os.remove(f"{model_dir}/model.onnx")
# Save information about ONNX runtime
with open(f"{model_dir}/onnx_runtime", "w") as f:
f.write(onnx_runtime)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(onnx_path)

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
def download_model(model_name: str, model_dir: str):
print(f"Downloading model {model_name} from huggingface model hub")
config = AutoConfig.from_pretrained(model_name)
model_type = config.to_dict()['model_type']

nltk.download('punkt', download_dir='./nltk_data')
if model_type is not None and model_type == "t5":
SentenceTransformer(model_name, cache_folder=model_dir)
with open(f"{model_dir}/model_name", "w") as f:
f.write(model_name.replace("/", "_"))
else:
if config.architectures and not force_automodel:
print(f"Using class {config.architectures[0]} to load model weights")
mod = __import__('transformers', fromlist=[config.architectures[0]])
try:
klass_architecture = getattr(mod, config.architectures[0])
model = klass_architecture.from_pretrained(model_name)
except AttributeError:
print(f"{config.architectures[0]} not found in transformers, fallback to AutoModel")
model = AutoModel.from_pretrained(model_name)
else:
model = AutoModel.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

nltk.download('punkt', download_dir='./nltk_data')

if onnx_runtime == "true":
download_onnx_model(model_name, model_dir)
else:
download_model(model_name, model_dir)
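After `download_onnx_model` runs, the model directory holds only `model_quantized.onnx`, the tokenizer files, and the `onnx_runtime` marker. A hedged sketch of how such a quantized artifact can be loaded back with optimum at inference time — the vectorizer changes are outside this excerpt, so treat this as an assumption about the serving side rather than the commit's actual code (`file_name` is optimum's way to select a non-default ONNX file):

```python
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model_dir = "./models/model"

# Load the quantized graph written by download_onnx_model above.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = ORTModelForFeatureExtraction.from_pretrained(
    model_dir, file_name="model_quantized.onnx"
)

inputs = tokenizer("hello onnx", return_tensors="pt")
outputs = model(**inputs)
# outputs.last_hidden_state has shape (batch, seq_len, hidden); sentence
# embeddings are typically derived from it with masked mean pooling.
```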
2 changes: 1 addition & 1 deletion meta.py
@@ -12,7 +12,7 @@ def get(self):
'model': self.config.to_dict()
}

def getModelType(self):
def get_model_type(self):
return self.config.to_dict()['model_type']

def get_architecture(self):
(Diffs for the remaining changed files were not loaded in this view.)
