From e4dabb0996afac23e5fe6bcedf136456dea5b320 Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Sun, 4 Jun 2023 18:56:15 -0400
Subject: [PATCH 1/6] Allow loading model with 4bit quantization.

For detail on 4bit options, see:
https://huggingface.co/blog/4bit-transformers-bitsandbytes
---
 basaran/__init__.py |  2 ++
 basaran/__main__.py |  4 ++++
 basaran/model.py    | 21 +++++++++++++++++++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/basaran/__init__.py b/basaran/__init__.py
index 28f20bd3..6b40c429 100644
--- a/basaran/__init__.py
+++ b/basaran/__init__.py
@@ -22,6 +22,8 @@ def is_true(value):
 MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", "models")
 MODEL_LOAD_IN_8BIT = is_true(os.getenv("MODEL_LOAD_IN_8BIT", ""))
 MODEL_LOAD_IN_4BIT = is_true(os.getenv("MODEL_LOAD_IN_4BIT", ""))
+MODEL_4BIT_QUANT_TYPE = os.getenv("MODEL_4BIT_QUANT_TYPE", "fp4")
+MODEL_4BIT_DOUBLE_QUANT = is_true(os.getenv("MODEL_4BIT_DOUBLE_QUANT", ""))
 MODEL_LOCAL_FILES_ONLY = is_true(os.getenv("MODEL_LOCAL_FILES_ONLY", ""))
 MODEL_TRUST_REMOTE_CODE = is_true(os.getenv("MODEL_TRUST_REMOTE_CODE", ""))
 MODEL_HALF_PRECISION = is_true(os.getenv("MODEL_HALF_PRECISION", ""))
diff --git a/basaran/__main__.py b/basaran/__main__.py
index bcd123b2..504f160b 100644
--- a/basaran/__main__.py
+++ b/basaran/__main__.py
@@ -21,6 +21,8 @@
 from . import MODEL_CACHE_DIR
 from . import MODEL_LOAD_IN_8BIT
 from . import MODEL_LOAD_IN_4BIT
+from . import MODEL_4BIT_QUANT_TYPE
+from . import MODEL_4BIT_DOUBLE_QUANT
 from . import MODEL_LOCAL_FILES_ONLY
 from . import MODEL_TRUST_REMOTE_CODE
 from . import MODEL_HALF_PRECISION
@@ -44,6 +46,8 @@
     cache_dir=MODEL_CACHE_DIR,
     load_in_8bit=MODEL_LOAD_IN_8BIT,
     load_in_4bit=MODEL_LOAD_IN_4BIT,
+    quant_type=MODEL_4BIT_QUANT_TYPE,
+    double_quant=MODEL_4BIT_DOUBLE_QUANT,
     local_files_only=MODEL_LOCAL_FILES_ONLY,
     trust_remote_code=MODEL_TRUST_REMOTE_CODE,
     half_precision=MODEL_HALF_PRECISION,
diff --git a/basaran/model.py b/basaran/model.py
index c1bda2d7..37b96d5b 100644
--- a/basaran/model.py
+++ b/basaran/model.py
@@ -12,6 +12,7 @@
     MinNewTokensLengthLogitsProcessor,
     TemperatureLogitsWarper,
     TopPLogitsWarper,
+    BitsAndBytesConfig
 )
 
 from .choice import map_choice
@@ -311,6 +312,8 @@ def load_model(
     cache_dir=None,
     load_in_8bit=False,
     load_in_4bit=False,
+    quant_type="fp4",
+    double_quant=False,
     local_files_only=False,
     trust_remote_code=False,
     half_precision=False,
@@ -328,10 +331,24 @@
 
     # Set device mapping and quantization options if CUDA is available.
     if torch.cuda.is_available():
+        # Set quantization options if specified.
+        quant_config = None
+        if load_in_8bit and load_in_4bit:
+            raise ValueError("Only one of load_in_8bit and load_in_4bit can be True")
+        if load_in_8bit:
+            quant_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+            )
+        elif load_in_4bit:
+            quant_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type=quant_type,
+                bnb_4bit_use_double_quant=double_quant,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )
         kwargs = kwargs.copy()
         kwargs["device_map"] = "auto"
-        kwargs["load_in_8bit"] = load_in_8bit
-        kwargs["load_in_4bit"] = load_in_4bit
+        kwargs["quantization_config"] = quant_config
 
     # Cast all parameters to float16 if quantization is enabled.
     if half_precision or load_in_8bit or load_in_4bit:
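For reference, the BitsAndBytesConfig plumbing added above corresponds to the
following standalone transformers usage. This is a minimal sketch, not part of
the patch: the model id and the nf4/double-quant settings are illustrative,
and it assumes transformers>=4.30 with bitsandbytes installed on a CUDA box.

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Equivalent of MODEL_LOAD_IN_4BIT=true with MODEL_4BIT_QUANT_TYPE=nf4
    # and MODEL_4BIT_DOUBLE_QUANT=true in the patch above.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # the patch's default is "fp4"
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "bigscience/bloomz-560m",  # illustrative model id
        device_map="auto",
        quantization_config=quant_config,
    )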
From ef138a056bf43246027d091977b73f37b92a2710 Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Sun, 4 Jun 2023 19:43:02 -0400
Subject: [PATCH 2/6] Support loading PEFT (LoRA) models

---
 basaran/__init__.py |  1 +
 basaran/__main__.py |  2 ++
 basaran/model.py    | 12 +++++++++++-
 requirements.txt    |  1 +
 4 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/basaran/__init__.py b/basaran/__init__.py
index 6b40c429..c5317d01 100644
--- a/basaran/__init__.py
+++ b/basaran/__init__.py
@@ -18,6 +18,7 @@ def is_true(value):
 PORT = int(os.getenv("PORT", "80"))
 
 # Model-related arguments:
+MODEL_PEFT = is_true(os.getenv("MODEL_PEFT", ""))
 MODEL_REVISION = os.getenv("MODEL_REVISION", "")
 MODEL_CACHE_DIR = os.getenv("MODEL_CACHE_DIR", "models")
 MODEL_LOAD_IN_8BIT = is_true(os.getenv("MODEL_LOAD_IN_8BIT", ""))
diff --git a/basaran/__main__.py b/basaran/__main__.py
index 504f160b..5adac250 100644
--- a/basaran/__main__.py
+++ b/basaran/__main__.py
@@ -23,6 +23,7 @@
 from . import MODEL_LOAD_IN_4BIT
 from . import MODEL_4BIT_QUANT_TYPE
 from . import MODEL_4BIT_DOUBLE_QUANT
+from . import MODEL_PEFT
 from . import MODEL_LOCAL_FILES_ONLY
 from . import MODEL_TRUST_REMOTE_CODE
 from . import MODEL_HALF_PRECISION
@@ -44,6 +45,7 @@
     name_or_path=MODEL,
     revision=MODEL_REVISION,
     cache_dir=MODEL_CACHE_DIR,
+    is_peft=MODEL_PEFT,
     load_in_8bit=MODEL_LOAD_IN_8BIT,
     load_in_4bit=MODEL_LOAD_IN_4BIT,
     quant_type=MODEL_4BIT_QUANT_TYPE,
diff --git a/basaran/model.py b/basaran/model.py
index 37b96d5b..64fab3f3 100644
--- a/basaran/model.py
+++ b/basaran/model.py
@@ -14,6 +14,10 @@
     TopPLogitsWarper,
     BitsAndBytesConfig
 )
+from peft import (
+    PeftConfig,
+    PeftModel
+)
 
 from .choice import map_choice
 from .tokenizer import StreamTokenizer
@@ -310,6 +314,7 @@ def load_model(
     name_or_path,
     revision=None,
     cache_dir=None,
+    is_peft=False,
     load_in_8bit=False,
     load_in_4bit=False,
     quant_type="fp4",
@@ -327,7 +332,6 @@
         kwargs["revision"] = revision
     if cache_dir:
         kwargs["cache_dir"] = cache_dir
-    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
 
     # Set device mapping and quantization options if CUDA is available.
     if torch.cuda.is_available():
@@ -354,6 +358,12 @@
     if half_precision or load_in_8bit or load_in_4bit:
         kwargs["torch_dtype"] = torch.float16
 
+    if is_peft:
+        peft_config = PeftConfig.from_pretrained(name_or_path)
+        name_or_path = peft_config.base_model_name_or_path
+
+    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
+
     # Support both decoder-only and encoder-decoder models.
     try:
         model = AutoModelForCausalLM.from_pretrained(name_or_path, **kwargs)
diff --git a/requirements.txt b/requirements.txt
index f36f38c0..f60f369c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ safetensors~=0.3.1
 torch>=1.12.1
 transformers[sentencepiece]~=4.30.1
 waitress~=2.1.2
+peft~=0.3.0
\ No newline at end of file
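Note that this patch only resolves the base model from the adapter; it does
not yet apply the adapter weights. PeftModel is imported but unused, and
peft_config is read solely for base_model_name_or_path (PATCH 4/6 and 5/6
below address this). The resolution step works like the following sketch,
with an illustrative adapter id:

    from peft import PeftConfig

    # adapter_config.json in the adapter repo records the base checkpoint.
    peft_config = PeftConfig.from_pretrained("some-user/some-lora-adapter")
    base_name_or_path = peft_config.base_model_name_or_path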
From 8153086dc2cd8680049e5cd9782904f5e215f1be Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Wed, 14 Jun 2023 21:20:57 -0400
Subject: [PATCH 3/6] Add missing deps

---
 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f60f369c..841a1005 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ safetensors~=0.3.1
 torch>=1.12.1
 transformers[sentencepiece]~=4.30.1
 waitress~=2.1.2
-peft~=0.3.0
\ No newline at end of file
+peft~=0.3.0
+scipy~=1.10.1
\ No newline at end of file

From eb11734c1e7b870ea3a61c82a198f742c0407d0e Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Fri, 16 Jun 2023 19:37:01 -0400
Subject: [PATCH 4/6] Actually create PEFT model if config supplied

---
 basaran/model.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/basaran/model.py b/basaran/model.py
index 64fab3f3..91490192 100644
--- a/basaran/model.py
+++ b/basaran/model.py
@@ -16,7 +16,8 @@
 )
 from peft import (
     PeftConfig,
-    PeftModel
+    PeftModelForCausalLM,
+    PeftModelForSeq2SeqLM
 )
 
 from .choice import map_choice
@@ -367,8 +368,12 @@
     # Support both decoder-only and encoder-decoder models.
     try:
         model = AutoModelForCausalLM.from_pretrained(name_or_path, **kwargs)
+        if is_peft:
+            model = PeftModelForCausalLM(model, peft_config)
     except ValueError:
         model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path, **kwargs)
+        if is_peft:
+            model = PeftModelForSeq2SeqLM(model, peft_config)
 
     # Check if the model has text generation capabilities.
     if not model.can_generate():

From e7468b150f2f5458a24dd4a4661e7cde40c0cfca Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Fri, 16 Jun 2023 20:42:17 -0400
Subject: [PATCH 5/6] Another attempt at loading PeftModel

---
 basaran/model.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/basaran/model.py b/basaran/model.py
index 91490192..5274d99a 100644
--- a/basaran/model.py
+++ b/basaran/model.py
@@ -16,8 +16,7 @@
 )
 from peft import (
     PeftConfig,
-    PeftModelForCausalLM,
-    PeftModelForSeq2SeqLM
+    PeftModel
 )
 
 from .choice import map_choice
@@ -361,6 +360,7 @@
 
     if is_peft:
         peft_config = PeftConfig.from_pretrained(name_or_path)
+        peft_model_name_or_path = name_or_path
         name_or_path = peft_config.base_model_name_or_path
 
     tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
@@ -368,12 +368,10 @@
     # Support both decoder-only and encoder-decoder models.
     try:
         model = AutoModelForCausalLM.from_pretrained(name_or_path, **kwargs)
-        if is_peft:
-            model = PeftModelForCausalLM(model, peft_config)
     except ValueError:
         model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path, **kwargs)
-        if is_peft:
-            model = PeftModelForSeq2SeqLM(model, peft_config)
+    if is_peft:
+        model = PeftModel.from_pretrained(model, peft_model_name_or_path)
 
     # Check if the model has text generation capabilities.
     if not model.can_generate():
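With this patch the PEFT flow is: read the adapter config, keep the adapter
path, load the tokenizer and base weights from base_model_name_or_path, then
wrap the base model via PeftModel.from_pretrained. Condensed into a standalone
sketch (the adapter id is illustrative):

    from peft import PeftConfig, PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    adapter_id = "some-user/some-lora-adapter"  # illustrative
    base_id = PeftConfig.from_pretrained(adapter_id).base_model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(base_id)
    base_model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")
    model = PeftModel.from_pretrained(base_model, adapter_id)  # applies LoRA weights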
From 33a37c1508dde064cdd1e3fb33456f24fd5b3d59 Mon Sep 17 00:00:00 2001
From: Sam Coward
Date: Sat, 17 Jun 2023 01:05:34 -0400
Subject: [PATCH 6/6] Pass kwargs to PeftModel loading

---
 basaran/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/basaran/model.py b/basaran/model.py
index 5274d99a..62a52c9b 100644
--- a/basaran/model.py
+++ b/basaran/model.py
@@ -371,7 +371,7 @@
     except ValueError:
         model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path, **kwargs)
     if is_peft:
-        model = PeftModel.from_pretrained(model, peft_model_name_or_path)
+        model = PeftModel.from_pretrained(model, peft_model_name_or_path, **kwargs)
 
     # Check if the model has text generation capabilities.
     if not model.can_generate():
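With the full series applied, a quantized PEFT load through the new load_model
parameters looks roughly like this. A sketch under stated assumptions: the
adapter id is illustrative, and whether every entry in kwargs (for example
torch_dtype) is accepted by PeftModel.from_pretrained in peft 0.3.x is not
verified by these patches:

    from basaran.model import load_model

    model = load_model(
        "some-user/some-lora-adapter",  # illustrative PEFT adapter repo
        is_peft=True,
        load_in_4bit=True,
        quant_type="nf4",   # MODEL_4BIT_QUANT_TYPE, default "fp4"
        double_quant=True,  # MODEL_4BIT_DOUBLE_QUANT
    )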