diff --git a/app.py b/app.py
index b8c20c3..bf4968f 100644
--- a/app.py
+++ b/app.py
@@ -44,12 +44,13 @@ def startup_event():
         direct_tokenize = True
     model_dir = "./models/model"
 
-    def get_model_directory() -> str:
+    def get_model_directory() -> (str, bool):
         if os.path.exists(f"{model_dir}/model_name"):
             with open(f"{model_dir}/model_name", "r") as f:
                 model_name = f.read()
-            return f"{model_dir}/{model_name}"
-        return model_dir
+            return f"{model_dir}/{model_name}", True
+        # Default model directory is ./models/model
+        return model_dir, False
 
     def get_onnx_runtime() -> bool:
         if os.path.exists(f"{model_dir}/onnx_runtime"):
@@ -66,13 +67,14 @@ def log_info_about_onnx(onnx_runtime: bool):
                     onnx_quantization_info = f.read()
             logger.info(f"Running ONNX vectorizer with quantized model for {onnx_quantization_info}")
 
+    model_dir, use_sentence_transformer_vectorizer = get_model_directory()
     onnx_runtime = get_onnx_runtime()
     log_info_about_onnx(onnx_runtime)
 
-    meta_config = Meta(get_model_directory())
-    vec = Vectorizer(get_model_directory(), cuda_support, cuda_core, cuda_per_process_memory_fraction,
+    meta_config = Meta(model_dir)
+    vec = Vectorizer(model_dir, cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.get_model_type(), meta_config.get_architecture(),
-                     direct_tokenize, onnx_runtime)
+                     direct_tokenize, onnx_runtime, use_sentence_transformer_vectorizer)
 
 
 @app.get("/.well-known/live", response_class=Response)
diff --git a/download.py b/download.py
index 0b54bf5..e6267de 100755
--- a/download.py
+++ b/download.py
@@ -34,6 +34,10 @@
 if not onnx_cpu_arch:
     onnx_cpu_arch = "arm64"
 
+use_sentence_transformers_vectorizer = os.getenv('USE_SENTENCE_TRANSFORMERS_VECTORIZER')
+if not use_sentence_transformers_vectorizer:
+    use_sentence_transformers_vectorizer = "false"
+
 print(f"Downloading MODEL_NAME={model_name} with FORCE_AUTOMODEL={force_automodel} ONNX_RUNTIME={onnx_runtime} ONNX_CPU={onnx_cpu_arch}")
 
 def download_onnx_model(model_name: str, model_dir: str):
@@ -82,7 +86,7 @@ def download_model(model_name: str, model_dir: str):
     config = AutoConfig.from_pretrained(model_name)
     model_type = config.to_dict()['model_type']
 
-    if model_type is not None and model_type == "t5":
+    if (model_type is not None and model_type == "t5") or use_sentence_transformers_vectorizer.lower() == "true":
         SentenceTransformer(model_name, cache_folder=model_dir)
         with open(f"{model_dir}/model_name", "w") as f:
             f.write(model_name.replace("/", "_"))
diff --git a/vectorizer.py b/vectorizer.py
index 8d13f34..6e403db 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -37,12 +37,13 @@ class VectorInput(BaseModel):
 class Vectorizer:
     executor: ThreadPoolExecutor
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool, onnx_runtime: bool):
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float,
+                 model_type: str, architecture: str, direct_tokenize: bool, onnx_runtime: bool, use_sentence_transformer_vectorizer: bool):
         self.executor = ThreadPoolExecutor()
         if onnx_runtime:
             self.vectorizer = ONNXVectorizer(model_path)
         else:
-            if model_type == 't5':
+            if model_type == 't5' or use_sentence_transformer_vectorizer:
                 self.vectorizer = SentenceTransformerVectorizer(model_path, cuda_core)
             else:
                 self.vectorizer = HuggingFaceVectorizer(model_path, cuda_support, cuda_core, cuda_per_process_memory_fraction, model_type, architecture, direct_tokenize)
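
For reviewers, a minimal sketch of the dispatch this diff introduces, assuming the branch order shown in `vectorizer.py` above. The helper `pick_vectorizer` and the string labels are hypothetical, used only to make the selection testable in isolation; the real dispatch happens inside `Vectorizer.__init__`.

```python
# Sketch only (not part of the diff): reproduces the vectorizer selection
# added above. `pick_vectorizer` is a hypothetical stand-in for the
# branching in Vectorizer.__init__ in vectorizer.py.
import os


def pick_vectorizer(model_type: str, onnx_runtime: bool,
                    use_sentence_transformer_vectorizer: bool) -> str:
    # Branch order mirrors Vectorizer.__init__: the ONNX runtime wins first,
    # then the t5 / sentence-transformers branch, then the HuggingFace fallback.
    if onnx_runtime:
        return "ONNXVectorizer"
    if model_type == "t5" or use_sentence_transformer_vectorizer:
        return "SentenceTransformerVectorizer"
    return "HuggingFaceVectorizer"


# download.py reads the env var as a string, and only "true" (any casing)
# enables the sentence-transformers download path; anything else stays "false".
flag = os.getenv("USE_SENTENCE_TRANSFORMERS_VECTORIZER", "false").lower() == "true"

# With the flag set, even a non-t5 model is served by SentenceTransformer:
assert pick_vectorizer("bert", onnx_runtime=False,
                       use_sentence_transformer_vectorizer=True) \
    == "SentenceTransformerVectorizer"
```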