From 1c392585324d618e5467e40350bae62354148544 Mon Sep 17 00:00:00 2001
From: Alex Cannan
Date: Fri, 21 Apr 2023 09:36:24 -0400
Subject: [PATCH 1/2] vectorize asynchronously with ThreadPoolExecutor

---
 app.py        |  2 +-
 vectorizer.py | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index 66405bd..907dd30 100644
--- a/app.py
+++ b/app.py
@@ -50,7 +50,7 @@ def startup_event():
 
 @app.get("/.well-known/live", response_class=Response)
 @app.get("/.well-known/ready", response_class=Response)
-def live_and_ready(response: Response):
+async def live_and_ready(response: Response):
     response.status_code = status.HTTP_204_NO_CONTENT
 
 
diff --git a/vectorizer.py b/vectorizer.py
index 2285691..d252f55 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -1,3 +1,5 @@
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 import math
 from typing import Optional
 import torch
@@ -33,6 +35,7 @@ class Vectorizer:
     cuda_core: str
     model_type: str
     direct_tokenize: bool
+    executor: ThreadPoolExecutor
 
     def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
         self.cuda = cuda_support
@@ -52,8 +55,10 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
+        self.executor = ThreadPoolExecutor(max_workers=1)
+
     def tokenize(self, text:str):
-        return self.tokenizer(text, padding=True, truncation=True, max_length=500,
+        return self.tokenizer(text, padding=True, truncation=True, max_length=500, 
                               add_special_tokens = True, return_tensors="pt")
 
     def get_embeddings(self, batch_results):
@@ -65,7 +70,7 @@ def get_batch_results(self, tokens, text):
     def pool_embedding(self, batch_results, tokens, config):
         return self.model_delegate.pool_embedding(batch_results, tokens, config)
 
-    async def vectorize(self, text: str, config: VectorInputConfig):
+    def _vectorize(self, text: str, config: VectorInputConfig):
         with torch.no_grad():
             if self.direct_tokenize:
                 # create embeddings without tokenizing text
@@ -92,6 +97,9 @@ async def vectorize(self, text: str, config: VectorInputConfig):
                 batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
             return batch_sum_vectors.detach() / num_sentences
 
+    async def vectorize(self, text: str, config: VectorInputConfig):
+        return await asyncio.wrap_future(self.executor.submit(self._vectorize, text, config))
+
 
 class HFModel:
 

From 4527559b00c7c441a634ab4d2a78f8b56ac05659 Mon Sep 17 00:00:00 2001
From: Alex Cannan
Date: Fri, 21 Apr 2023 11:18:27 -0400
Subject: [PATCH 2/2] unbounded executor workers

---
 vectorizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vectorizer.py b/vectorizer.py
index d252f55..50d3307 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -55,7 +55,7 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
-        self.executor = ThreadPoolExecutor(max_workers=1)
+        self.executor = ThreadPoolExecutor()
 
     def tokenize(self, text:str):
         return self.tokenizer(text, padding=True, truncation=True, max_length=500, 