From 18873d324efa53eee76dc711d87e80474df0af43 Mon Sep 17 00:00:00 2001 From: Marcin Antas Date: Sat, 5 Oct 2024 21:07:14 +0200 Subject: [PATCH] Add support for nomic-ai/nomic-embed-text-v1.5 model --- .github/workflows/main.yaml | 5 +++ Dockerfile | 2 + app.py | 24 ++++++++++-- cicd/build.sh | 4 ++ cicd/docker_push.sh | 5 +++ download.py | 23 ++++++++++-- meta.py | 15 +++++--- smoke_test.py | 10 ++++- vectorizer.py | 74 +++++++++++++++++++++++++------------ 9 files changed, 124 insertions(+), 38 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 39c3ed9..564449f 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -133,6 +133,10 @@ jobs: - model_name: Snowflake/snowflake-arctic-embed-m-v1.5 model_tag_name: snowflake-snowflake-arctic-embed-m-v1.5 onnx_runtime: true + - model_name: nomic-ai/nomic-embed-text-v1.5 + model_tag_name: nomic-ai-nomic-embed-text-v1.5 + use_sentence_transformers_vectorizer: true + trust_remote_code: true env: LOCAL_REPO: transformers-inference REMOTE_REPO: semitechnologies/transformers-inference @@ -140,6 +144,7 @@ jobs: MODEL_TAG_NAME: ${{matrix.model_tag_name}} ONNX_RUNTIME: ${{matrix.onnx_runtime}} USE_SENTENCE_TRANSFORMERS_VECTORIZER: ${{matrix.use_sentence_transformers_vectorizer}} + TRUST_REMOTE_CODE: ${{matrix.trust_remote_code}} steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/Dockerfile b/Dockerfile index 0fb30f5..2857bba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,6 +16,8 @@ ARG TARGETARCH ARG MODEL_NAME ARG ONNX_RUNTIME ENV ONNX_CPU=${TARGETARCH} +ARG TRUST_REMOTE_CODE +ARG USE_SENTENCE_TRANSFORMERS_VECTORIZER RUN mkdir nltk_data COPY download.py . RUN ./download.py diff --git a/app.py b/app.py index ed593d4..94373df 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,8 @@ import os from logging import getLogger from fastapi import FastAPI, Response, status +from typing import Union +from config import TRUST_REMOTE_CODE from vectorizer import Vectorizer, VectorInput from meta import Meta @@ -55,7 +57,7 @@ def startup_event(): model_dir = "./models/model" - def get_model_directory() -> (str, bool): + def get_model_name() -> Union[str, bool]: if os.path.exists(f"{model_dir}/model_name"): with open(f"{model_dir}/model_name", "r") as f: model_name = f.read() @@ -70,6 +72,13 @@ def get_onnx_runtime() -> bool: return onnx_runtime == "true" return False + def get_trust_remote_code() -> bool: + if os.path.exists(f"{model_dir}/trust_remote_code"): + with open(f"{model_dir}/trust_remote_code", "r") as f: + trust_remote_code = f.read() + return trust_remote_code == "true" + return TRUST_REMOTE_CODE + def log_info_about_onnx(onnx_runtime: bool): if onnx_runtime: onnx_quantization_info = "missing" @@ -80,11 +89,17 @@ def log_info_about_onnx(onnx_runtime: bool): f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}" ) - model_name, use_sentence_transformer_vectorizer = get_model_directory() + model_name, use_sentence_transformer_vectorizer = get_model_name() onnx_runtime = get_onnx_runtime() + trust_remote_code = get_trust_remote_code() log_info_about_onnx(onnx_runtime) - meta_config = Meta(model_dir, model_name, use_sentence_transformer_vectorizer) + meta_config = Meta( + model_dir, + model_name, + use_sentence_transformer_vectorizer, + trust_remote_code, + ) vec = Vectorizer( model_dir, cuda_support, @@ -96,6 +111,7 @@ def log_info_about_onnx(onnx_runtime: bool): onnx_runtime, use_sentence_transformer_vectorizer, model_name, + trust_remote_code, ) @@ -112,7 +128,7 @@ def meta(): @app.post("/vectors") @app.post("/vectors/") -async def read_item(item: VectorInput, response: Response): +async def vectorize(item: VectorInput, response: Response): try: vector = await vec.vectorize(item.text, item.config) return {"text": item.text, "vector": vector.tolist(), "dim": len(vector)} diff --git a/cicd/build.sh b/cicd/build.sh index c705643..e4f7681 100755 --- a/cicd/build.sh +++ b/cicd/build.sh @@ -5,8 +5,12 @@ set -eou pipefail local_repo=${LOCAL_REPO?Variable LOCAL_REPO is required} model_name=${MODEL_NAME?Variable MODEL_NAME is required} onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required} +trust_remote_code=${TRUST_REMOTE_CODE:-false} +use_sentence_transformers_vectorizer=${USE_SENTENCE_TRANSFORMERS_VECTORIZER:-false} docker build \ --build-arg "MODEL_NAME=$model_name" \ --build-arg "ONNX_RUNTIME=$onnx_runtime" \ + --build-arg "TRUST_REMOTE_CODE=$trust_remote_code" \ + --build-arg "USE_SENTENCE_TRANSFORMERS_VECTORIZER=$use_sentence_transformers_vectorizer" \ -t "$local_repo" . diff --git a/cicd/docker_push.sh b/cicd/docker_push.sh index 48535c6..c44156b 100755 --- a/cicd/docker_push.sh +++ b/cicd/docker_push.sh @@ -7,6 +7,8 @@ model_name=${MODEL_NAME?Variable MODEL_NAME is required} docker_username=${DOCKER_USERNAME?Variable DOCKER_USERNAME is required} docker_password=${DOCKER_PASSWORD?Variable DOCKER_PASSWORD is required} onnx_runtime=${ONNX_RUNTIME?Variable ONNX_RUNTIME is required} +trust_remote_code=${TRUST_REMOTE_CODE:-false} +use_sentence_transformers_vectorizer=${USE_SENTENCE_TRANSFORMERS_VECTORIZER:-false} original_model_name=$model_name git_tag=$GITHUB_REF_NAME @@ -16,6 +18,7 @@ function main() { echo "git ref name is $GITHUB_REF_NAME" echo "git tag is $git_tag" echo "onnx_runtime is $onnx_runtime" + echo "trust_remote_code is $trust_remote_code" push_tag } @@ -46,6 +49,8 @@ function push_tag() { docker buildx build --platform=linux/arm64,linux/amd64 \ --build-arg "MODEL_NAME=$original_model_name" \ --build-arg "ONNX_RUNTIME=$onnx_runtime" \ + --build-arg "TRUST_REMOTE_CODE=$trust_remote_code" \ + --build-arg "USE_SENTENCE_TRANSFORMERS_VECTORIZER=$use_sentence_transformers_vectorizer" \ --push \ --tag "$tag_git" \ --tag "$tag_latest" \ diff --git a/download.py b/download.py index 0b025a6..a7458da 100755 --- a/download.py +++ b/download.py @@ -3,6 +3,7 @@ import os import sys import nltk +import json from transformers import ( AutoModel, AutoTokenizer, @@ -98,6 +99,18 @@ def quantization_config(onnx_cpu_arch: str): def download_model(model_name: str, model_dir: str, trust_remote_code: bool = False): + def save_model_name(model_name: str): + with open(f"{model_dir}/model_name", "w") as f: + f.write(model_name) + + def save_trust_remote_code(trust_remote_code: bool): + with open(f"{model_dir}/trust_remote_code", "w") as f: + f.write(f"{trust_remote_code}") + + def save_model_config(model_config): + with open(f"{model_dir}/model_config", "w") as f: + f.write(json.dumps(model_config)) + print( f"Downloading model {model_name} from huggingface model hub ({trust_remote_code=})" ) @@ -107,9 +120,11 @@ def download_model(model_name: str, model_dir: str, trust_remote_code: bool = Fa if ( model_type is not None and model_type == "t5" ) or use_sentence_transformers_vectorizer.lower() == "true": - SentenceTransformer(model_name, cache_folder=model_dir) - with open(f"{model_dir}/model_name", "w") as f: - f.write(model_name) + SentenceTransformer( + model_name, cache_folder=model_dir, trust_remote_code=trust_remote_code + ) + save_model_name(model_name) + save_model_config(config.to_dict()) else: if config.architectures and not force_automodel: print(f"Using class {config.architectures[0]} to load model weights") @@ -136,6 +151,8 @@ def download_model(model_name: str, model_dir: str, trust_remote_code: bool = Fa model.save_pretrained(model_dir) tokenizer.save_pretrained(model_dir) + save_trust_remote_code(trust_remote_code) + nltk.download("punkt", download_dir=nltk_dir) nltk.download("punkt_tab", download_dir=nltk_dir) diff --git a/meta.py b/meta.py index 1998866..cb67896 100644 --- a/meta.py +++ b/meta.py @@ -1,22 +1,25 @@ +import json +import os from transformers import AutoConfig -from config import TRUST_REMOTE_CODE - class Meta: - config: AutoConfig - def __init__( self, model_path: str, model_name: str, use_sentence_transformer_vectorizer: bool, + trust_remote_code: bool, ): if use_sentence_transformer_vectorizer: - self.config = {"model_name": model_name, "model_type": None} + if os.path.exists(f"{model_path}/model_config"): + with open(f"{model_path}/model_config", "r") as f: + self.config = json.loads(f.read()) + else: + self.config = {"model_name": model_name, "model_type": None} else: self.config = AutoConfig.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=trust_remote_code ).to_dict() def get(self): diff --git a/smoke_test.py b/smoke_test.py index 46f57a4..f7d8f21 100755 --- a/smoke_test.py +++ b/smoke_test.py @@ -37,9 +37,15 @@ def test_meta(self): self.assertIsInstance(res.json(), dict) def test_vectorizing(self): - def try_to_vectorize(url): - print(f"url: {url}") + def get_req_body(task_type: str = ""): req_body = {"text": "The London Eye is a ferris wheel at the River Thames."} + if task_type != "": + req_body["config"] = {"task_type": task_type} + return req_body + + def try_to_vectorize(url, task_type: str = ""): + print(f"url: {url}") + req_body = get_req_body(task_type) res = requests.post(url, json=req_body) resBody = res.json() diff --git a/vectorizer.py b/vectorizer.py index b284bde..b6888d6 100644 --- a/vectorizer.py +++ b/vectorizer.py @@ -20,7 +20,6 @@ T5Tokenizer, ) -from config import TRUST_REMOTE_CODE # limit transformer batch size to limit parallel inference, otherwise we run # into memory problems @@ -29,7 +28,8 @@ class VectorInputConfig(BaseModel): - pooling_strategy: str + pooling_strategy: Optional[str] = None + task_type: Optional[str] = None class VectorInput(BaseModel): @@ -52,14 +52,15 @@ def __init__( onnx_runtime: bool, use_sentence_transformer_vectorizer: bool, model_name: str, + trust_remote_code: bool, ): self.executor = ThreadPoolExecutor() if onnx_runtime: - self.vectorizer = ONNXVectorizer(model_path) + self.vectorizer = ONNXVectorizer(model_path, trust_remote_code) else: if model_type == "t5" or use_sentence_transformer_vectorizer: self.vectorizer = SentenceTransformerVectorizer( - model_path, model_name, cuda_core + model_path, model_name, cuda_core, trust_remote_code ) else: self.vectorizer = HuggingFaceVectorizer( @@ -70,6 +71,7 @@ def __init__( model_type, architecture, direct_tokenize, + trust_remote_code, ) async def vectorize(self, text: str, config: VectorInputConfig): @@ -82,10 +84,18 @@ class SentenceTransformerVectorizer: model: SentenceTransformer cuda_core: str - def __init__(self, model_path: str, model_name: str, cuda_core: str): + def __init__( + self, model_path: str, model_name: str, cuda_core: str, trust_remote_code: bool + ): self.cuda_core = cuda_core + print( + f"model_name={model_name}, cache_folder={model_path} device:{self.get_device()} trust_remote_code:{trust_remote_code}" + ) self.model = SentenceTransformer( - model_name, cache_folder=model_path, device=self.get_device() + model_name, + cache_folder=model_path, + device=self.get_device(), + trust_remote_code=trust_remote_code, ) self.model.eval() # make sure we're in inference mode, not training @@ -108,15 +118,15 @@ class ONNXVectorizer: model: ORTModelForFeatureExtraction tokenizer: AutoTokenizer - def __init__(self, model_path) -> None: + def __init__(self, model_path, trust_remote_code: bool) -> None: onnx_path = Path(model_path) self.model = ORTModelForFeatureExtraction.from_pretrained( onnx_path, file_name="model_quantized.onnx", - trust_remote_code=TRUST_REMOTE_CODE, + trust_remote_code=trust_remote_code, ) self.tokenizer = AutoTokenizer.from_pretrained( - onnx_path, trust_remote_code=TRUST_REMOTE_CODE + onnx_path, trust_remote_code=trust_remote_code ) def mean_pooling(self, model_output, attention_mask): @@ -155,6 +165,7 @@ class HuggingFaceVectorizer: cuda_core: str model_type: str direct_tokenize: bool + trust_remote_code: bool def __init__( self, @@ -165,15 +176,17 @@ def __init__( model_type: str, architecture: str, direct_tokenize: bool, + trust_remote_code: bool, ): self.cuda = cuda_support self.cuda_core = cuda_core self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction self.model_type = model_type self.direct_tokenize = direct_tokenize + self.trust_remote_code = trust_remote_code self.model_delegate: HFModel = ModelFactory.model( - model_type, architecture, cuda_support, cuda_core + model_type, architecture, cuda_support, cuda_core, trust_remote_code ) self.model = self.model_delegate.create_model(model_path) @@ -246,22 +259,23 @@ def vectorize(self, text: str, config: VectorInputConfig): class HFModel: - def __init__(self, cuda_support: bool, cuda_core: str): + def __init__(self, cuda_support: bool, cuda_core: str, trust_remote_code: bool): super().__init__() self.model = None self.tokenizer = None self.cuda = cuda_support self.cuda_core = cuda_core + self.trust_remote_code = trust_remote_code def create_tokenizer(self, model_path): self.tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) return self.tokenizer def create_model(self, model_path): self.model = AutoModel.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) return self.model @@ -318,19 +332,26 @@ def pool_sum(self, embeddings, attention_mask): class DPRModel(HFModel): - def __init__(self, architecture: str, cuda_support: bool, cuda_core: str): + def __init__( + self, + architecture: str, + cuda_support: bool, + cuda_core: str, + trust_remote_code: bool, + ): super().__init__(cuda_support, cuda_core) self.model = None self.architecture = architecture + self.trust_remote_code = trust_remote_code def create_model(self, model_path): if self.architecture == "DPRQuestionEncoder": self.model = DPRQuestionEncoder.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) else: self.model = DPRContextEncoder.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) return self.model @@ -344,22 +365,23 @@ def pool_embedding(self, batch_results, tokens, config: VectorInputConfig): class T5Model(HFModel): - def __init__(self, cuda_support: bool, cuda_core: str): + def __init__(self, cuda_support: bool, cuda_core: str, trust_remote_code: bool): super().__init__(cuda_support, cuda_core) self.model = None self.tokenizer = None self.cuda = cuda_support self.cuda_core = cuda_core + self.trust_remote_code = trust_remote_code def create_model(self, model_path): self.model = T5ForConditionalGeneration.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) return self.model def create_tokenizer(self, model_path): self.tokenizer = T5Tokenizer.from_pretrained( - model_path, trust_remote_code=TRUST_REMOTE_CODE + model_path, trust_remote_code=self.trust_remote_code ) return self.tokenizer @@ -386,10 +408,16 @@ def get_batch_results(self, tokens, text): class ModelFactory: @staticmethod - def model(model_type, architecture, cuda_support: bool, cuda_core: str): + def model( + model_type, + architecture, + cuda_support: bool, + cuda_core: str, + trust_remote_code: bool, + ): if model_type == "t5": - return T5Model(cuda_support, cuda_core) + return T5Model(cuda_support, cuda_core, trust_remote_code) elif model_type == "dpr": - return DPRModel(architecture, cuda_support, cuda_core) + return DPRModel(architecture, cuda_support, cuda_core, trust_remote_code) else: - return HFModel(cuda_support, cuda_core) + return HFModel(cuda_support, cuda_core, trust_remote_code)