Add support for PyTorch GPU memory limit (#41)
* Add support for PyTorch GPU memory limit

* log bad CUDA memory setting as an error

* Add support for sentence-t5-base model

* Switch to github actions

* Fixed deploy step

* Fix GIT_TAG env variable in deploy step

* Enable all of the transformers models

* Add duplicate /vectors route to remove 307 redirects (see the routing sketch below)

* remove unused packages, remove installation of third party libs (#38)

* remove unused packages, remove installation of third party libs

* update requirements

* more explicit variable naming for CUDA memory percentage

Co-authored-by: Marcin Antas <[email protected]>
Co-authored-by: John Trengrove <[email protected]>
Co-authored-by: Stefan Bogdan <[email protected]>
4 people authored Jan 18, 2023
1 parent 495dce8 commit 4a38a7b
Showing 2 changed files with 13 additions and 2 deletions.
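
For context on the "duplicate /vectors route" item in the commit message above, a minimal FastAPI sketch of the idea (app setup and handler body are illustrative, not taken from this repository): registering both the slashed and unslashed spelling of the path lets either form be served directly, instead of one of them being answered with an automatic 307 trailing-slash redirect.

from fastapi import FastAPI

app = FastAPI()

# Both spellings map to the same handler, so neither request triggers
# FastAPI's automatic 307 trailing-slash redirect.
@app.post("/vectors")
@app.post("/vectors/")
async def vectors(payload: dict):
    return {"vector": []}  # placeholder response
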
10 changes: 9 additions & 1 deletion app.py
@@ -17,6 +17,14 @@ def startup_event():
     global meta_config

     cuda_env = os.getenv("ENABLE_CUDA")
+    cuda_per_process_memory_fraction = 1.0
+    if "CUDA_PER_PROCESS_MEMORY_FRACTION" in os.environ:
+        try:
+            cuda_per_process_memory_fraction = float(os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION"))
+        except ValueError:
+            logger.error(f"Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
+    if 0.0 <= cuda_per_process_memory_fraction <= 1.0:
+        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {cuda_per_process_memory_fraction}")
     cuda_support=False
     cuda_core=""

@@ -30,7 +38,7 @@ def startup_event():
         logger.info("Running on CPU")

     meta_config = Meta('./models/model')
-    vec = Vectorizer('./models/model', cuda_support, cuda_core,
+    vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.getModelType(), meta_config.get_architecture())

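A standalone sketch of the env-var handling added to startup_event() above (the function name and logging setup are illustrative): a value that cannot be parsed as a float is logged as an error and the fraction falls back to the default of 1.0, i.e. no memory limit.

import logging
import os

logger = logging.getLogger(__name__)

def read_cuda_memory_fraction() -> float:
    # Default of 1.0 means the whole GPU may be used.
    fraction = 1.0
    raw = os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION")
    if raw is not None:
        try:
            fraction = float(raw)
        except ValueError:
            # Mirrors the commit: log the bad value as an error and keep the default.
            logger.error("Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
    if 0.0 <= fraction <= 1.0:
        logger.info("CUDA_PER_PROCESS_MEMORY_FRACTION set to %s", fraction)
    return fraction
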
5 changes: 4 additions & 1 deletion vectorizer.py
@@ -33,16 +33,19 @@ class Vectorizer:
     cuda_core: str
     model_type: str

-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, model_type: str, architecture: str):
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
+        self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type

         self.model_delegate: HFModel = ModelFactory.model(model_type, architecture)
         self.model = self.model_delegate.create_model(model_path)

         if self.cuda:
             self.model.to(self.cuda_core)
+            if self.cuda_per_process_memory_fraction:
+                torch.cuda.set_per_process_memory_fraction(self.cuda_per_process_memory_fraction)
         self.model.eval() # make sure we're in inference mode, not training

         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
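
For reference, a minimal sketch of the PyTorch call the constructor now makes when CUDA is enabled (assumes a CUDA build of torch and at least one visible GPU; the 0.5 value is illustrative): once the fraction is set, allocations in this process beyond that share of the device's total memory raise an out-of-memory error instead of growing unbounded.

import torch

if torch.cuda.is_available():
    # Cap this process at 50% of device 0's total memory.
    torch.cuda.set_per_process_memory_fraction(0.5, device=0)
    # Allocations work as usual; only the per-process ceiling changes.
    x = torch.empty(1024, 1024, device="cuda:0")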
