Add support for PyTorch GPU memory limit #41

Merged · 11 commits · Jan 18, 2023
app.py (9 additions, 1 deletion)

@@ -17,6 +17,14 @@ def startup_event():
     global meta_config
 
     cuda_env = os.getenv("ENABLE_CUDA")
+    cuda_per_process_memory_fraction = 1.0
+    if "CUDA_PER_PROCESS_MEMORY_FRACTION" in os.environ:
+        try:
+            cuda_per_process_memory_fraction = float(os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION"))
+        except ValueError:
+            logger.error("Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
+    if 0.0 <= cuda_per_process_memory_fraction <= 1.0:
+        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {cuda_per_process_memory_fraction}")
     cuda_support=False
     cuda_core=""
@@ -30,7 +38,7 @@ def startup_event():
         logger.info("Running on CPU")
 
     meta_config = Meta('./models/model')
-    vec = Vectorizer('./models/model', cuda_support, cuda_core,
+    vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.getModelType(), meta_config.get_architecture())


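The startup hook now reads an optional `CUDA_PER_PROCESS_MEMORY_FRACTION` environment variable, defaulting to 1.0 (the whole device), and hands it to the `Vectorizer`. A minimal standalone sketch of the same parsing logic; the helper name `read_memory_fraction` is hypothetical, not part of this PR:

```python
import logging
import os

logger = logging.getLogger(__name__)

def read_memory_fraction() -> float:
    """Parse CUDA_PER_PROCESS_MEMORY_FRACTION, falling back to 1.0 (no cap)."""
    fraction = 1.0
    raw = os.environ.get("CUDA_PER_PROCESS_MEMORY_FRACTION")
    if raw is not None:
        try:
            fraction = float(raw)
        except ValueError:
            logger.error("Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
    if 0.0 <= fraction <= 1.0:
        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {fraction}")
    return fraction
```

With this in place, starting the service with, for example, `ENABLE_CUDA=1 CUDA_PER_PROCESS_MEMORY_FRACTION=0.5` limits the process to half of the GPU's total memory, while leaving the variable unset keeps the full device available.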
vectorizer.py (4 additions, 1 deletion)

@@ -33,16 +33,19 @@ class Vectorizer:
     cuda_core: str
     model_type: str
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, model_type: str, architecture: str):
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
+        self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type
 
         self.model_delegate: HFModel = ModelFactory.model(model_type, architecture)
         self.model = self.model_delegate.create_model(model_path)
 
         if self.cuda:
             self.model.to(self.cuda_core)
+            if self.cuda_per_process_memory_fraction:
+                torch.cuda.set_per_process_memory_fraction(self.cuda_per_process_memory_fraction)
         self.model.eval()  # make sure we're in inference mode, not training
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
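`torch.cuda.set_per_process_memory_fraction()` caps PyTorch's caching allocator at fraction × total device memory for the calling process; allocations beyond the cap raise an out-of-memory error rather than consuming the rest of the GPU. A small sketch of that behavior, assuming a recent PyTorch (1.13+ for `torch.cuda.OutOfMemoryError`); the 0.5 fraction and allocation size are illustrative:

```python
import torch

if torch.cuda.is_available():
    # Limit this process to 50% of device 0's total memory (illustrative value).
    torch.cuda.set_per_process_memory_fraction(0.5, device=0)

    total = torch.cuda.get_device_properties(0).total_memory
    print(f"device total: {total / 1e9:.1f} GB, cap: {0.5 * total / 1e9:.1f} GB")

    try:
        # Requesting more bytes than the cap allows should fail with OOM.
        _ = torch.empty(int(0.6 * total), dtype=torch.uint8, device="cuda:0")
    except torch.cuda.OutOfMemoryError:
        print("allocation above the configured fraction was rejected")
```

Two details worth noting: the call applies to the current CUDA device when no `device` argument is given, and the guard `if self.cuda_per_process_memory_fraction:` skips the call for a value of 0.0, since 0.0 is falsy in Python.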