Allow loading model with BitsAndBytes 4bit quantization, PEFT LoRA adapters. #203
base: master
Changes from all commits
e4dabb0
ef138a0
8153086
eb11734
e7468b1
33a37c1
@@ -12,6 +12,11 @@
     MinNewTokensLengthLogitsProcessor,
     TemperatureLogitsWarper,
     TopPLogitsWarper,
+    BitsAndBytesConfig
 )
+from peft import (
+    PeftConfig,
+    PeftModel
+)

 from .choice import map_choice
@@ -309,8 +314,11 @@ def load_model(
     name_or_path,
     revision=None,
    cache_dir=None,
+    is_peft=False,
     load_in_8bit=False,
+    load_in_4bit=False,
+    quant_type="fp4",
+    double_quant=False,
     local_files_only=False,
     trust_remote_code=False,
     half_precision=False,
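Taken together, the new signature might be exercised like this. A hedged sketch, where the adapter repo name and flag values are placeholders rather than anything from the PR:

# Hypothetical call into the extended load_model(); the checkpoint is a
# placeholder, and the return value is whatever upstream load_model returns.
model = load_model(
    "tloen/alpaca-lora-7b",  # a LoRA adapter repo, not a base model
    is_peft=True,            # resolve the base model from the adapter config
    load_in_4bit=True,       # quantize the base model via bitsandbytes
    quant_type="nf4",        # "fp4" is the diff's default; "nf4" also works
    double_quant=True,       # nested quantization for extra memory savings
)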
@@ -324,24 +332,46 @@ def load_model(
         kwargs["revision"] = revision
     if cache_dir:
         kwargs["cache_dir"] = cache_dir
-    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)

     # Set device mapping and quantization options if CUDA is available.
     if torch.cuda.is_available():
+        # Set quantization options if specified.
+        quant_config = None
+        if load_in_8bit and load_in_4bit:
+            raise ValueError("Only one of load_in_8bit and load_in_4bit can be True")
+        if load_in_8bit:
+            quant_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+            )
+        elif load_in_4bit:
+            quant_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type=quant_type,
+                bnb_4bit_use_double_quant=double_quant,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+            )

Review thread on bnb_4bit_compute_dtype=torch.bfloat16:

Reviewer: Hardcoding bnb_4bit_compute_dtype? Reference: https://huggingface.co/blog/4bit-transformers-bitsandbytes#advanced-usage

Author: Yes, I agree it's a lot more configuration options. I just wasn't sure how much people play around with the different options, so I put them all in! I think hardcoding [...] Finally, with [...] Sorry for all the questions, I'm still trying to level up on ML code, and once again appreciate the feedback!
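If the hardcoded compute dtype were exposed instead, the 4-bit branch could accept it as one more parameter. A minimal sketch of that alternative (the compute_dtype parameter and the helper are hypothetical, not part of this PR):

import torch
from transformers import BitsAndBytesConfig

# Hypothetical helper: same options as the diff, but with the compute dtype
# passed in rather than pinned to torch.bfloat16 (which pre-Ampere GPUs
# do not support).
def make_4bit_config(quant_type="fp4", double_quant=False,
                     compute_dtype=torch.float16):
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=quant_type,
        bnb_4bit_use_double_quant=double_quant,
        bnb_4bit_compute_dtype=compute_dtype,
    )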
The hunk continues:

         kwargs = kwargs.copy()
         kwargs["device_map"] = "auto"
         kwargs["load_in_8bit"] = load_in_8bit
+        kwargs["load_in_4bit"] = load_in_4bit
+        kwargs["quantization_config"] = quant_config

         # Cast all parameters to float16 if quantization is enabled.
+        if half_precision or load_in_8bit or load_in_4bit:
             kwargs["torch_dtype"] = torch.float16

+    if is_peft:
+        peft_config = PeftConfig.from_pretrained(name_or_path)
+        peft_model_name_or_path = name_or_path
+        name_or_path = peft_config.base_model_name_or_path
+
+    tokenizer = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
(peakji marked a review thread on this tokenizer line as resolved.)
The hunk continues:

     # Support both decoder-only and encoder-decoder models.
     try:
         model = AutoModelForCausalLM.from_pretrained(name_or_path, **kwargs)
     except ValueError:
         model = AutoModelForSeq2SeqLM.from_pretrained(name_or_path, **kwargs)
+    if is_peft:
+        model = PeftModel.from_pretrained(model, peft_model_name_or_path, **kwargs)

     # Check if the model has text generation capabilities.
     if not model.can_generate():
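Pulled out of the diff, the PEFT path reduces to a three-step flow. A standalone sketch under the same assumptions as the PR (the adapter repo name is a placeholder):

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_id = "tloen/alpaca-lora-7b"  # placeholder LoRA adapter repo

# 1. The adapter config records which base model it was trained against.
peft_config = PeftConfig.from_pretrained(adapter_id)
base_id = peft_config.base_model_name_or_path

# 2. Tokenizer and weights come from the base model, not the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.float16)

# 3. The LoRA weights are layered on top of the loaded base model.
model = PeftModel.from_pretrained(model, adapter_id)

This is also why the diff moves the AutoTokenizer call below the is_peft block: the tokenizer has to be loaded from the resolved base model, not from the adapter repo.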
@@ -12,3 +12,5 @@ safetensors~=0.3.1
 torch>=1.12.1
 transformers[sentencepiece]~=4.30.1
 waitress~=2.1.2
+peft~=0.3.0
+scipy~=1.10.1
Review thread on scipy~=1.10.1:

Reviewer: The missing dependency is a bug in bitsandbytes: bitsandbytes-foundation/bitsandbytes#426. Instead of specifying the version of the indirect dependency, I suggest waiting for bitsandbytes to fix the issue in version 0.39.0.

Author: Thanks for pointing this out. It did feel a bit weird that I had to add this!
Review thread on the PeftModel import:

Reviewer: It seems that PeftModel is not being used. Are you sure that PEFT is working correctly? (The GitHub Actions environment does not have a GPU for testing.)

Author: Oops 😅 well, that explains why I wasn't seeing any results from my LoRA fine-tunings.
Author: So I added loading. But it only works in 4-bit with the dev version of peft. Loading in 4-bit with peft 0.3.0 errors on inference.
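Until a peft release ships that fix, the constraint could be made explicit inside load_model with a runtime guard. A sketch, assuming the 0.3.0 boundary described in the comment above (is_peft and load_in_4bit are the function's parameters):

from importlib.metadata import version
from packaging.version import Version

# Assumption: peft releases up to and including 0.3.0 fail at inference
# time when the base model is loaded in 4-bit, per the discussion above.
if is_peft and load_in_4bit and Version(version("peft")) <= Version("0.3.0"):
    raise RuntimeError(
        "Loading 4-bit quantized models with PEFT adapters requires a peft "
        "version newer than 0.3.0 (e.g. an install from the main branch)."
    )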