Add support for PyTorch GPU memory limit #41

Merged · 11 commits · Jan 18, 2023
app.py (9 additions, 1 deletion)

@@ -17,6 +17,14 @@ def startup_event():
     global meta_config
 
     cuda_env = os.getenv("ENABLE_CUDA")
+    cuda_per_process_memory_fraction = 1.0
+    if "CUDA_PER_PROCESS_MEMORY_FRACTION" in os.environ:
+        try:
+            cuda_per_process_memory_fraction = float(os.getenv("CUDA_PER_PROCESS_MEMORY_FRACTION"))
+        except ValueError:
+            logger.error("Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
+    if 0.0 <= cuda_per_process_memory_fraction <= 1.0:
+        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {cuda_per_process_memory_fraction}")
     cuda_support=False
     cuda_core=""
@@ -30,7 +38,7 @@ def startup_event():
         logger.info("Running on CPU")
 
     meta_config = Meta('./models/model')
-    vec = Vectorizer('./models/model', cuda_support, cuda_core,
+    vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
                      meta_config.getModelType(), meta_config.get_architecture())


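The startup hook now reads an optional `CUDA_PER_PROCESS_MEMORY_FRACTION` environment variable, defaulting to 1.0 (the whole device), and hands it to the `Vectorizer`. A minimal standalone sketch of the same parsing logic; the helper name `read_memory_fraction` is hypothetical, not part of this PR:

```python
import logging
import os

logger = logging.getLogger(__name__)

def read_memory_fraction() -> float:
    """Parse CUDA_PER_PROCESS_MEMORY_FRACTION, falling back to 1.0 (no cap)."""
    fraction = 1.0
    raw = os.environ.get("CUDA_PER_PROCESS_MEMORY_FRACTION")
    if raw is not None:
        try:
            fraction = float(raw)
        except ValueError:
            logger.error("Invalid CUDA_PER_PROCESS_MEMORY_FRACTION (should be between 0.0-1.0)")
    if 0.0 <= fraction <= 1.0:
        logger.info(f"CUDA_PER_PROCESS_MEMORY_FRACTION set to {fraction}")
    return fraction
```

With this in place, starting the service with, for example, `ENABLE_CUDA=1 CUDA_PER_PROCESS_MEMORY_FRACTION=0.5` limits the process to half of the GPU's total memory, while leaving the variable unset keeps the full device available.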
vectorizer.py (4 additions, 1 deletion)

@@ -33,16 +33,19 @@ class Vectorizer:
     cuda_core: str
     model_type: str
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, model_type: str, architecture: str):
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
+        self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type
 
         self.model_delegate: HFModel = ModelFactory.model(model_type, architecture)
         self.model = self.model_delegate.create_model(model_path)
 
         if self.cuda:
             self.model.to(self.cuda_core)
+            if self.cuda_per_process_memory_fraction:
+                torch.cuda.set_per_process_memory_fraction(self.cuda_per_process_memory_fraction)
         self.model.eval()  # make sure we're in inference mode, not training
 
         self.tokenizer = self.model_delegate.create_tokenizer(model_path)
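`torch.cuda.set_per_process_memory_fraction()` caps PyTorch's caching allocator at fraction × total device memory for the calling process; allocations beyond the cap raise an out-of-memory error rather than consuming the rest of the GPU. A small sketch of that behavior, assuming a recent PyTorch (1.13+ for `torch.cuda.OutOfMemoryError`); the 0.5 fraction and allocation size are illustrative:

```python
import torch

if torch.cuda.is_available():
    # Limit this process to 50% of device 0's total memory (illustrative value).
    torch.cuda.set_per_process_memory_fraction(0.5, device=0)

    total = torch.cuda.get_device_properties(0).total_memory
    print(f"device total: {total / 1e9:.1f} GB, cap: {0.5 * total / 1e9:.1f} GB")

    try:
        # Requesting more bytes than the cap allows should fail with OOM.
        _ = torch.empty(int(0.6 * total), dtype=torch.uint8, device="cuda:0")
    except torch.cuda.OutOfMemoryError:
        print("allocation above the configured fraction was rejected")
```

Two details worth noting: the call applies to the current CUDA device when no `device` argument is given, and the guard `if self.cuda_per_process_memory_fraction:` skips the call for a value of 0.0, since 0.0 is falsy in Python.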