From 1c392585324d618e5467e40350bae62354148544 Mon Sep 17 00:00:00 2001
From: Alex Cannan
Date: Fri, 21 Apr 2023 09:36:24 -0400
Subject: [PATCH 1/2] vectorize asynchronously with ThreadPoolExecutor

---
 app.py        |  2 +-
 vectorizer.py | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/app.py b/app.py
index 66405bd..907dd30 100644
--- a/app.py
+++ b/app.py
@@ -50,7 +50,7 @@ def startup_event():
 
 @app.get("/.well-known/live", response_class=Response)
 @app.get("/.well-known/ready", response_class=Response)
-def live_and_ready(response: Response):
+async def live_and_ready(response: Response):
     response.status_code = status.HTTP_204_NO_CONTENT
 
 
diff --git a/vectorizer.py b/vectorizer.py
index 2285691..d252f55 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -1,3 +1,5 @@
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 import math
 from typing import Optional
 import torch
@@ -33,6 +35,7 @@ class Vectorizer:
     cuda_core: str
     model_type: str
     direct_tokenize: bool
+    executor: ThreadPoolExecutor
 
     def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
         self.cuda = cuda_support
@@ -52,8 +55,10 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
+        self.executor = ThreadPoolExecutor(max_workers=1)
+
     def tokenize(self, text:str):
-        return self.tokenizer(text, padding=True, truncation=True, max_length=500,
+        return self.tokenizer(text, padding=True, truncation=True, max_length=500, 
                               add_special_tokens = True, return_tensors="pt")
 
     def get_embeddings(self, batch_results):
@@ -65,7 +70,7 @@ def get_batch_results(self, tokens, text):
     def pool_embedding(self, batch_results, tokens, config):
         return self.model_delegate.pool_embedding(batch_results, tokens, config)
 
-    async def vectorize(self, text: str, config: VectorInputConfig):
+    def _vectorize(self, text: str, config: VectorInputConfig):
         with torch.no_grad():
             if self.direct_tokenize:
                 # create embeddings without tokenizing text
@@ -92,6 +97,9 @@ async def vectorize(self, text: str, config: VectorInputConfig):
                 batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
             return batch_sum_vectors.detach() / num_sentences
 
+    async def vectorize(self, text: str, config: VectorInputConfig):
+        return await asyncio.wrap_future(self.executor.submit(self._vectorize, text, config))
+
 
 class HFModel:
 

From 4527559b00c7c441a634ab4d2a78f8b56ac05659 Mon Sep 17 00:00:00 2001
From: Alex Cannan
Date: Fri, 21 Apr 2023 11:18:27 -0400
Subject: [PATCH 2/2] unbounded executor workers

---
 vectorizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vectorizer.py b/vectorizer.py
index d252f55..50d3307 100644
--- a/vectorizer.py
+++ b/vectorizer.py
@@ -55,7 +55,7 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
 
-        self.executor = ThreadPoolExecutor(max_workers=1)
+        self.executor = ThreadPoolExecutor()
 
     def tokenize(self, text:str):
         return self.tokenizer(text, padding=True, truncation=True, max_length=500, 