Merge pull request #47 from iMountTai/main
add support for 4bit inference
ymcui authored Aug 2, 2023
2 parents 2e0aca6 + d66efde commit 7b3e5ab
Showing 4 changed files with 40 additions and 11 deletions.
requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -2,4 +2,4 @@ torch==2.0.1
 git+https://github.com/huggingface/peft.git@13e53fc
 transformers==4.31.0
 sentencepiece==0.1.97
-bitsandbytes==0.39.1
+bitsandbytes==0.41.0
scripts/inference/gradio_demo.py (21 changes: 16 additions & 5 deletions)
@@ -3,6 +3,7 @@
     LlamaForCausalLM,
     LlamaTokenizer,
     StoppingCriteria,
+    BitsAndBytesConfig
 )
 import gradio as gr
 import argparse
@@ -57,7 +58,11 @@
 parser.add_argument(
     '--load_in_8bit',
     action='store_true',
-    help='Use 8 bit quantified model')
+    help='Use 8 bit quantized model')
+parser.add_argument(
+    '--load_in_4bit',
+    action='store_true',
+    help='Use 4 bit quantized model')
 parser.add_argument(
     '--only_cpu',
     action='store_true',
@@ -90,7 +95,10 @@
 args = parser.parse_args()
 if args.only_cpu is True:
     args.gpus = ""
-
+    if args.load_in_8bit or args.load_in_4bit:
+        raise ValueError("Quantization is unavailable on CPU.")
+if args.load_in_8bit and args.load_in_4bit:
+    raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
 import sys
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
@@ -118,7 +126,7 @@ def setup():
 
     if args.lora_model is not None:
         raise ValueError("vLLM currently does not support LoRA, please merge the LoRA weights to the base model.")
-    if args.load_in_8bit:
+    if args.load_in_8bit or args.load_in_4bit:
         raise ValueError("vLLM currently does not support quantization, please use fp16 (default) or unuse --use_vllm.")
     if args.only_cpu:
         raise ValueError("vLLM requires GPUs with compute capability not less than 7.0. If you want to run only on CPU, please unuse --use_vllm.")
@@ -141,7 +149,6 @@ def setup():
     max_memory = args.max_memory
     port = args.port
     share = args.share
-    load_in_8bit = args.load_in_8bit
     load_type = torch.float16
     if torch.cuda.is_available():
         device = torch.device(0)
@@ -155,10 +162,14 @@
 
     base_model = LlamaForCausalLM.from_pretrained(
         args.base_model,
-        load_in_8bit=load_in_8bit,
         torch_dtype=load_type,
         low_cpu_mem_usage=True,
         device_map='auto',
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=args.load_in_4bit,
+            load_in_8bit=args.load_in_8bit,
+            bnb_4bit_compute_dtype=load_type
+        )
         )
 
     model_vocab_size = base_model.get_input_embeddings().weight.size(0)
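For reference, the quantized-loading pattern introduced above can be exercised on its own. The following is a minimal sketch, not code from this commit: the model path is a placeholder, and it assumes a CUDA GPU plus the versions pinned in requirements.txt (transformers 4.31.0, bitsandbytes 0.41.0).

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig

model_path = "path/to/llama-2-hf-checkpoint"  # placeholder, not taken from the diff

tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map='auto',
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,                     # what --load_in_4bit toggles
        bnb_4bit_compute_dtype=torch.float16,  # matches load_type in the scripts
    ),
)
model.eval()  # weights are stored in 4-bit; matmuls run in the compute dtype

Passing load_in_8bit=True instead selects the 8-bit path that these scripts previously enabled through the load_in_8bit argument of from_pretrained.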
scripts/inference/inference_hf.py (14 changes: 12 additions & 2 deletions)
@@ -22,22 +22,28 @@
 parser.add_argument('--only_cpu',action='store_true',help='only use CPU for inference')
 parser.add_argument('--alpha',type=str,default="1.0", help="The scaling factor of NTK method, can be a float or 'auto'. ")
 parser.add_argument('--load_in_8bit',action='store_true', help="Load the LLM in the 8bit mode")
+parser.add_argument('--load_in_4bit',action='store_true', help="Load the LLM in the 4bit mode")
 parser.add_argument("--use_vllm", action='store_true', help="Use vLLM as back-end LLM service.")
 parser.add_argument('--system_prompt',type=str,default=DEFAULT_SYSTEM_PROMPT, help="The system prompt of the prompt template.")
 args = parser.parse_args()
 if args.use_vllm:
     if args.lora_model is not None:
         raise ValueError("vLLM currently does not support LoRA, please merge the LoRA weights to the base model.")
-    if args.load_in_8bit:
+    if args.load_in_8bit or args.load_in_4bit:
         raise ValueError("vLLM currently does not support quantization, please use fp16 (default) or unuse --use_vllm.")
     if args.only_cpu:
         raise ValueError("vLLM requires GPUs with compute capability not less than 7.0. If you want to run only on CPU, please unuse --use_vllm.")
+if args.load_in_8bit and args.load_in_4bit:
+    raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
 if args.only_cpu is True:
     args.gpus = ""
+    if args.load_in_8bit or args.load_in_4bit:
+        raise ValueError("Quantization is unavailable on CPU.")
 os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
 import torch
 from transformers import LlamaForCausalLM, LlamaTokenizer
 from transformers import GenerationConfig
+from transformers import BitsAndBytesConfig
 from peft import PeftModel
 if args.use_vllm:
     from vllm import LLM, SamplingParams
@@ -96,10 +102,14 @@ def generate_prompt(instruction):
 
 base_model = LlamaForCausalLM.from_pretrained(
     args.base_model,
-    load_in_8bit=args.load_in_8bit,
     torch_dtype=load_type,
     low_cpu_mem_usage=True,
     device_map='auto',
+    quantization_config=BitsAndBytesConfig(
+        load_in_4bit=args.load_in_4bit,
+        load_in_8bit=args.load_in_8bit,
+        bnb_4bit_compute_dtype=load_type
+    )
     )
 
 model_vocab_size = base_model.get_input_embeddings().weight.size(0)
scripts/openai_server_demo/openai_api_server.py (14 changes: 11 additions & 3 deletions)
@@ -9,17 +9,21 @@
 parser.add_argument('--tokenizer_path',default=None,type=str)
 parser.add_argument('--gpus', default="0", type=str)
 parser.add_argument('--load_in_8bit',action='store_true', help='Load the model in 8bit mode')
+parser.add_argument('--load_in_4bit',action='store_true', help='Load the model in 4bit mode')
 parser.add_argument('--only_cpu',action='store_true',help='Only use CPU for inference')
 parser.add_argument('--alpha',type=str,default="1.0", help="The scaling factor of NTK method, can be a float or 'auto'. ")
 args = parser.parse_args()
-load_in_8bit = args.load_in_8bit
 if args.only_cpu is True:
     args.gpus = ""
+    if args.load_in_8bit or args.load_in_4bit:
+        raise ValueError("Quantization is unavailable on CPU.")
+if args.load_in_8bit and args.load_in_4bit:
+    raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
 os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
 
 import torch
 import torch.nn.functional as F
-from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
+from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig, BitsAndBytesConfig
 from peft import PeftModel
 
 import sys
@@ -54,10 +58,14 @@
 
 base_model = LlamaForCausalLM.from_pretrained(
     args.base_model,
-    load_in_8bit=load_in_8bit,
     torch_dtype=load_type,
     low_cpu_mem_usage=True,
     device_map='auto' if not args.only_cpu else None,
+    quantization_config=BitsAndBytesConfig(
+        load_in_4bit=args.load_in_4bit,
+        load_in_8bit=args.load_in_8bit,
+        bnb_4bit_compute_dtype=load_type
+    )
     )
 
 model_vocab_size = base_model.get_input_embeddings().weight.size(0)
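All three updated scripts apply the same flag handling before loading the model. Condensed into one hedged sketch (argument definitions abbreviated; only the quantization-related checks are shown):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--load_in_8bit', action='store_true', help='Load the model in 8bit mode')
parser.add_argument('--load_in_4bit', action='store_true', help='Load the model in 4bit mode')
parser.add_argument('--only_cpu', action='store_true', help='Only use CPU for inference')
args = parser.parse_args()

# 8-bit and 4-bit quantization are mutually exclusive.
if args.load_in_8bit and args.load_in_4bit:
    raise ValueError("Only one quantization method can be chosen for inference. Please check your arguments")
# bitsandbytes quantization runs on GPU, so it is rejected together with --only_cpu.
if args.only_cpu and (args.load_in_8bit or args.load_in_4bit):
    raise ValueError("Quantization is unavailable on CPU.")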
