[BFCL] Patch Generation Script for Locally Hosted OSS model #537

Merged: 13 commits, Jul 24, 2024
berkeley-function-call-leaderboard/README.md: 2 additions & 1 deletion
@@ -209,6 +209,7 @@ Some companies have proposed some optimization strategies in their models' handl
## Changelog

* [July 21, 2024] [#538](https://github.com/ShishirPatil/gorilla/pull/538): Fix `language_specific_pre_processing` function to properly handle pre-processing for prompts and function docs in Java and JavaScript test categories. All entries in these categories are affected.
* [July 20, 2024] [#537](https://github.com/ShishirPatil/gorilla/pull/537): Update generation script for locally-hosted OSS model to use single-node multi-GPU inference method (tensor parallel). Ray is not used anymore.
* [July 16, 2024] [#525](https://github.com/ShishirPatil/gorilla/pull/525), [#536](https://github.com/ShishirPatil/gorilla/pull/536): Add new model `ibm-granite/granite-20b-functioncalling` to the leaderboard.
* [July 10, 2024] [#522](https://github.com/ShishirPatil/gorilla/pull/522): Bug fix in the evaluation dataset for Executable Parallel Multiple category. This includes updates to both prompts and function docs. 2 entries are affected.
* [July 8, 2024] [#516](https://github.com/ShishirPatil/gorilla/pull/516): Fix double-casting issue in `model_handler` for Java and JavaScript test categories.
@@ -252,7 +253,7 @@ Some companies have proposed some optimization strategies in their models' handl
* [April 3, 2024] [#309](https://github.com/ShishirPatil/gorilla/pull/309): Bug fix for evaluation dataset possible answers. Implement **string standardization** for the AST evaluation pipeline, i.e. removing white spaces and a subset of punctuations (`,./-_*^`) to make the AST evaluation more robust and accurate. Fixed AST evaluation issue for type `tuple`. Add 2 new models `meetkai/functionary-small-v2.4 (FC)`, `meetkai/functionary-medium-v2.4 (FC)` to the leaderboard.
* [April 1, 2024] [#299](https://github.com/ShishirPatil/gorilla/pull/299): Leaderboard update with new models (`Claude-3-Haiku`, `Databrick-DBRX-Instruct`), more advanced AST evaluation procedure, and updated evaluation datasets. Cost and latency statistics during evaluation are also measured. We also released the manual that our evaluation procedure is based on, available [here](https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#metrics).
* [Mar 11, 2024] [#254](https://github.com/ShishirPatil/gorilla/pull/254): Leaderboard update with 3 new models: `Claude-3-Opus-20240229 (Prompt)`, `Claude-3-Sonnet-20240229 (Prompt)`, and `meetkai/functionary-medium-v2.2 (FC)`
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `gemma`.
* [Mar 5, 2024] [#237](https://github.com/ShishirPatil/gorilla/pull/237) and [238](https://github.com/ShishirPatil/gorilla/pull/238): leaderboard update resulting from [#223](https://github.com/ShishirPatil/gorilla/pull/223); 3 new models: `mistral-large-2402`, `gemini-1.0-pro`, and `google/gemma-7b-it`.
* [Feb 29, 2024] [#223](https://github.com/ShishirPatil/gorilla/pull/223): modifications to REST evaluation.
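
For reference, the July 20, 2024 entry above corresponds to the pattern sketched below: single-node, multi-GPU inference with vLLM tensor parallelism. This is a minimal sketch only; the model name and prompt are placeholders, not part of the leaderboard code.

```python
# Minimal sketch: single-node, multi-GPU (tensor-parallel) generation with vLLM.
# The model name and prompt are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder OSS model
    dtype="float16",
    trust_remote_code=True,
    tensor_parallel_size=4,  # shard the model across 4 GPUs on one node
)
sampling_params = SamplingParams(temperature=0.7, top_p=1.0, max_tokens=1000)
outputs = llm.generate(
    ["USER: What is the weather in Berkeley today?\nASSISTANT: "],
    sampling_params,
)
print(outputs[0].outputs[0].text)
```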
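
The string standardization mentioned in the April 3, 2024 entry can be pictured as follows. This is an illustrative sketch only, not the leaderboard's exact implementation; it just removes whitespace and the listed punctuation subset before comparison.

```python
# Illustrative sketch only: remove whitespace and the punctuation subset ,./-_*^
# before comparing strings, as described in the April 3, 2024 entry.
import re

def standardize_string(value: str) -> str:
    # Drop whitespace and ,./-_*^ so cosmetic differences do not fail the match.
    return re.sub(r"[\s,./\-_*^]", "", value)

assert standardize_string("Mt. Everest") == standardize_string("Mt Everest")
```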


berkeley-function-call-leaderboard/model_handler/glm_handler.py: 18 additions & 79 deletions
@@ -21,8 +21,10 @@
class GLMHandler(OSSHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.tensor_parallel_size = 8
self.max_model_len=4096
self.stop_token_ids = [151329, 151336, 151338]


def apply_chat_template(self, prompt, function, test_category):
oai_tool = convert_to_tool(
function, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category, True
@@ -32,94 +34,31 @@ def apply_chat_template(self, prompt, function, test_category):
conversation, tokenize=False, add_generation_prompt=True
)

def _batch_generate(
self,
question_jsons,
test_category,
model_path,
temperature,
max_tokens,
top_p,
index,
llm,
):
from vllm import SamplingParams

prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
prompts.append(line)
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": line,
}
)

print("start generating: ", len(prompts))
stop_token_ids = [151329, 151336, 151338]
sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons


def inference(self, test_question, test_category, num_gpus):
from transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name, trust_remote_code=True
)

chat_template_ques_jsons = []
for line in test_question:
prompt = augment_prompt_by_languge(line["question"], test_category)
function = language_specific_pre_processing(
line["function"], test_category
)
chat_template_ques_jsons.append(
self.apply_chat_template(prompt, function, test_category)
)

chunk_size = len(test_question) // num_gpus
from vllm import LLM

llm = LLM(
model=self.model_name,
dtype="float16",
trust_remote_code=True,
tensor_parallel_size=self.tensor_parallel_size,
max_model_len=4096,
test_question = self.process_input(
test_question, test_category, self.apply_chat_template
)

ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=self.stop_token_ids,
max_model_len=self.max_model_len,
num_gpus=num_gpus,
)
ans_jsons = []
for i in range(0, len(test_question), chunk_size):
output = self._batch_generate(
chat_template_ques_jsons[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
i,
llm,
)
ans_jsons.extend(output)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}


def decode_ast(self, result, language="Python"):
args = result.split("\n")
if len(args) == 1:
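
To show the pattern this handler now follows, here is a hedged sketch of a subclass that reuses the shared `OSSHandler` pipeline introduced by this PR. The class name and stop-token IDs are illustrative; the real `GLMHandler` additionally applies its own chat template via `process_input`.

```python
# Hedged sketch of a handler subclass built on the updated OSSHandler.
# The class name and stop-token IDs are illustrative, not the exact GLMHandler code.
from model_handler.oss_handler import OSSHandler


class MyLocalModelHandler(OSSHandler):
    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
        super().__init__(model_name, temperature, top_p, max_tokens)
        self.stop_token_ids = [151329, 151336, 151338]  # model-specific stop tokens
        self.max_model_len = 4096

    def inference(self, test_question, test_category, num_gpus):
        # Delegate prompt formatting and batched vLLM generation to OSSHandler,
        # forwarding the model-specific limits through the new parameters.
        return super().inference(
            test_question,
            test_category,
            num_gpus,
            stop_token_ids=self.stop_token_ids,
            max_model_len=self.max_model_len,
        )
```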
@@ -60,7 +60,6 @@
"gemini-1.5-pro-preview-0409": GeminiHandler,
"gemini-1.5-pro-preview-0514": GeminiHandler,
"gemini-1.5-flash-preview-0514": GeminiHandler,
"gemma": OSSHandler,
"google/gemma-7b-it": GemmaHandler,
"glaiveai/glaive-function-calling-v1": GlaiveHandler,
"deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
berkeley-function-call-leaderboard/model_handler/oss_handler.py: 54 additions & 59 deletions
@@ -1,9 +1,7 @@
import json
import os

import ray
import shortuuid
import torch
from eval_checker.eval_checker_constant import FILENAME_INDEX_MAPPING
from model_handler.handler import BaseHandler
from model_handler.model_style import ModelStyle
@@ -13,14 +11,11 @@
language_specific_pre_processing,
)


class OSSHandler(BaseHandler):
def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
super().__init__(model_name, temperature, top_p, max_tokens)
self.model_style = ModelStyle.OSSMODEL
self._init_model()

def _init_model(self):
ray.init(ignore_reinit_error=True, num_cpus=8)

def _format_prompt(prompt, function, test_category):
SYSTEM_PROMPT = """
@@ -34,78 +29,78 @@ def _format_prompt(prompt, function, test_category):
functions += "\n" + str(function)
return f"SYSTEM: {SYSTEM_PROMPT}\n{functions}\nUSER: {prompt}\nASSISTANT: "

@ray.remote(num_gpus=1)
@torch.inference_mode()
@staticmethod
def _batch_generate(
question_jsons,
test_category,
test_question,
model_path,
temperature,
max_tokens,
top_p,
format_prompt_func,
index,
stop_token_ids=None,
max_model_len=None,
num_gpus=1,
):
from vllm import LLM, SamplingParams

print("start generating, test question length: ", len(test_question))

sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
stop_token_ids=stop_token_ids,
)
llm = LLM(
model=model_path,
dtype="float16",
trust_remote_code=True,
disable_custom_all_reduce=True,
max_model_len=max_model_len,
tensor_parallel_size=num_gpus,
)
outputs = llm.generate(test_question, sampling_params)

final_ans_jsons = []
for output in outputs:
text = output.outputs[0].text
final_ans_jsons.append(text)
return final_ans_jsons

@staticmethod
def process_input(test_question, test_category, format_prompt_func):
prompts = []
ans_jsons = []
for line in question_jsons:
for key, value in FILENAME_INDEX_MAPPING.items():
start, end = value
if index >= start and index < end:
test_category = key
break
ques_json = line
for ques_json in test_question:
prompt = augment_prompt_by_languge(ques_json["question"], test_category)
functions = language_specific_pre_processing(
ques_json["function"], test_category
)
prompts.append(format_prompt_func(prompt, functions, test_category))
ans_id = shortuuid.uuid()
ans_jsons.append(
{
"answer_id": ans_id,
"question": ques_json["question"],
}
)

print("start generating: ", len(prompts))
sampling_params = SamplingParams(
temperature=temperature, max_tokens=max_tokens, top_p=top_p
)
llm = LLM(model=model_path, dtype="float16", trust_remote_code=True)
outputs = llm.generate(prompts, sampling_params)
final_ans_jsons = []
for output, ans_json in zip(outputs, ans_jsons):
text = output.outputs[0].text
ans_json["text"] = text
final_ans_jsons.append(ans_json)
return final_ans_jsons
return prompts

def inference(
self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
self,
test_question,
test_category,
num_gpus,
format_prompt_func=_format_prompt,
stop_token_ids=None,
max_model_len=None,
):
test_question = self.process_input(
test_question, test_category, format_prompt_func
)


chunk_size = len(test_question) // num_gpus
ans_handles = []
for i in range(0, len(test_question), chunk_size):
ans_handles.append(
self._batch_generate.remote(
test_question[i : i + chunk_size],
test_category,
self.model_name,
self.temperature,
self.max_tokens,
self.top_p,
format_prompt_func,
i,
)
)
ans_jsons = []
for ans_handle in ans_handles:
ans_jsons.extend(ray.get(ans_handle))
ans_jsons = self._batch_generate(
test_question=test_question,
model_path=self.model_name,
temperature=self.temperature,
max_tokens=self.max_tokens,
top_p=self.top_p,
stop_token_ids=stop_token_ids,
max_model_len=max_model_len,
num_gpus=num_gpus,
)

return ans_jsons, {"input_tokens": 0, "output_tokens": 0, "latency": 0}

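
For reference, a hedged sketch of calling the refactored `_batch_generate` directly with the keyword arguments visible in this diff. The prompt string and model path are placeholders; `num_gpus` now maps to vLLM's `tensor_parallel_size` instead of a Ray worker count.

```python
# Hedged sketch: exercising the refactored static _batch_generate outside the
# handler classes. The prompt string and model path are placeholders.
from model_handler.oss_handler import OSSHandler

prompts = [
    "SYSTEM: You are a helpful assistant with access to functions.\n"
    "USER: What's the weather in Berkeley?\nASSISTANT: ",
]

answers = OSSHandler._batch_generate(
    test_question=prompts,          # already-formatted prompt strings
    model_path="glaiveai/glaive-function-calling-v1",  # placeholder model path
    temperature=0.7,
    max_tokens=1000,
    top_p=1,
    stop_token_ids=None,            # or a model-specific list of token IDs
    max_model_len=4096,
    num_gpus=2,                     # one vLLM engine sharded across 2 GPUs
)
print(answers[0])
```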
@@ -90,13 +90,13 @@ def load_file(test_categories):
num_existing_result += 1

if handler.model_style == ModelStyle.OSSMODEL:
result = handler.inference(
result, metadata = handler.inference(
test_question = test_cases[num_existing_result:],
test_category = test_category,
num_gpus = args.num_gpus,
)
for index, res in enumerate(result[0]):
result_to_write = {"id": index, "result": res["text"]}
for index, res in enumerate(result):
result_to_write = {"id": index, "result": res}
handler.write(result_to_write, file_to_open)
else:
for index, test_case in enumerate(tqdm(test_cases)):
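
Putting the pieces together, here is a hedged sketch of driving an OSS handler end to end after this change: `inference()` now returns an `(answers, metadata)` pair and each answer is a plain string. The model name, test entry, category, and output file name are all placeholders, not the generation script's exact code.

```python
# Hedged sketch of the OSS-model path end to end after this PR: inference()
# returns (answers, metadata) and each answer is a plain string.
from model_handler.oss_handler import OSSHandler

handler = OSSHandler("glaiveai/glaive-function-calling-v1")  # placeholder model
test_cases = [  # toy entry in the BFCL question format
    {
        "question": "What's the weather in Berkeley?",
        "function": [{
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {"type": "dict", "properties": {"city": {"type": "string"}}},
        }],
    }
]

answers, metadata = handler.inference(
    test_question=test_cases,
    test_category="simple",  # placeholder test category
    num_gpus=2,              # passed through to vLLM as tensor_parallel_size
)

for index, answer in enumerate(answers):
    handler.write({"id": index, "result": answer}, "placeholder_result.json")
```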