[BFCL] Support Category-Specific Generation for OSS Model, Remove eval_data_compilation Step (#512)

Currently, for OSS models, users must run `eval_data_compilation.py` to merge all datasets into a single `eval_data_total.json` file before running inference. That forces inference to run on every dataset at once, with no option to target an individual dataset or a subset. This PR removes that limitation: users can now run inference on specific datasets directly, and the `eval_data_compilation` step is no longer needed.
Note: hosted models do not have this limitation.

Partially addresses #501 and #502.
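
For illustration, a minimal sketch of the new call pattern from the caller's side (a sketch only, not the exact BFCL code: `load_category_file` is a stand-in helper, and the file name and category shown are examples; the `test_question`, `test_category`, and `num_gpus` arguments mirror the handler signature changed in this PR):

```python
# Illustrative sketch of the interface change for OSS models (not the exact
# BFCL code). `handler` is assumed to be an OSS model handler built by
# build_handler; the data file and category names below are examples.
import json

def load_category_file(path: str) -> list:
    """Read one category's JSON-Lines test file into a list of dicts."""
    with open(path) as f:
        return [json.loads(line) for line in f]

# Old flow (removed by this PR): a single merged file covering every category.
# result = handler.inference(
#     question_file="eval_data_total.json",
#     test_category=args.test_category,
#     num_gpus=args.num_gpus,
# )

# New flow: load one category's file and pass the parsed entries in memory,
# so inference can target a single dataset and resume from partial results.
# test_cases = load_category_file("./data/gorilla_openfunctions_v1_test_simple.json")
# result = handler.inference(
#     test_question=test_cases[num_existing_result:],
#     test_category="simple",
#     num_gpus=1,
# )
```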
HuanzhiMao authored Jul 17, 2024
1 parent 7bef000 commit 951c728
Showing 10 changed files with 49 additions and 94 deletions.
5 changes: 1 addition & 4 deletions berkeley-function-call-leaderboard/README.md
@@ -47,11 +47,7 @@ The `apply_function_credential_config.py` takes an input and optionally an output
 python apply_function_credential_config.py --input-file data/gorilla_openfunctions_v1_test_rest.json
 ```
 
-Then, use `eval_data_compilation.py` to compile all files by
 
-```bash
-python eval_data_compilation.py
-```
 ## Berkeley Function-Calling Leaderboard Statistics
 
 
@@ -209,6 +205,7 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure

## Changelog

* [July 7, 2024] [#504](https://github.com/ShishirPatil/gorilla/pull/504), [#505](https://github.com/ShishirPatil/gorilla/pull/505), [#506](https://github.com/ShishirPatil/gorilla/pull/506), [#508](https://github.com/ShishirPatil/gorilla/pull/508), [#510](https://github.com/ShishirPatil/gorilla/pull/510), [#512](https://github.com/ShishirPatil/gorilla/pull/512), [#517](https://github.com/ShishirPatil/gorilla/pull/517): Make BFCL user-friendly and easy to extend.
* [July 6, 2024] [#423](https://github.com/ShishirPatil/gorilla/pull/423) and [#503](https://github.com/ShishirPatil/gorilla/pull/503): Bug fix in possible answers for the AST evaluation dataset (parallel category: 14 affected; parallel_multiple category: 25 affected).
* [July 5, 2024] [#496](https://github.com/ShishirPatil/gorilla/pull/496): Updates to API status checks. Checking the health of executable APIs is now off by default. Further, even when triggered, un-healthy APIs will not terminate the evaluation process. Users can enable this feature by setting the `--api-sanity-check` flag or `-c` for short. The previous `--skip-api-sanity-check` or `-s` flag is now deprecated.
* [July 3, 2024] [#489](https://github.com/ShishirPatil/gorilla/pull/489): Add new model `nvidia/nemotron-4-340b-instruct` to the leaderboard.
37 changes: 0 additions & 37 deletions berkeley-function-call-leaderboard/eval_data_compilation.py

This file was deleted.

@@ -21,10 +21,10 @@ def _format_prompt(prompt, function, test_category):
         return formatted_prompt.format(function=function, prompt=prompt)
 
     def inference(
-        self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, fromat_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, fromat_prompt_func
+            test_question, test_category, num_gpus, fromat_prompt_func
         )
 
     def decode_ast(self, result, language="Python"):

@@ -22,10 +22,10 @@ def _format_prompt(prompt, function, test_category):
         return formatted_prompt.format(function=function, prompt=prompt)
 
     def inference(
-        self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, fromat_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, fromat_prompt_func
+            test_question, test_category, num_gpus, fromat_prompt_func
         )
 
     def decode_ast(self, result, language="Python"):

@@ -16,10 +16,10 @@ def _format_prompt(prompt, function, test_category):
         return formatted_prompt.format(function=function, prompt=prompt)
 
     def inference(
-        self, question_file, test_category, num_gpus, fromat_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, fromat_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, fromat_prompt_func
+            test_question, test_category, num_gpus, fromat_prompt_func
        )
 
     def decode_ast(self, result, language="Python"):
12 changes: 4 additions & 8 deletions berkeley-function-call-leaderboard/model_handler/glm_handler.py
@@ -78,18 +78,14 @@ def _batch_generate(
             final_ans_jsons.append(ans_json)
         return final_ans_jsons
 
-    def inference(self, question_file, test_category, num_gpus):
+    def inference(self, test_question, test_category, num_gpus):
         from transformers import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_name, trust_remote_code=True
         )
 
-        ques_jsons = []
-        with open(question_file, "r") as ques_file:
-            for line in ques_file:
-                ques_jsons.append(json.loads(line))
         chat_template_ques_jsons = []
-        for line in ques_jsons:
+        for line in test_question:
             prompt = augment_prompt_by_languge(line["question"], test_category)
             function = language_specific_pre_processing(
                 line["function"], test_category, False
@@ -98,7 +94,7 @@ def inference(self, question_file, test_category, num_gpus):
                 self.apply_chat_template(prompt, function, test_category)
             )
 
-        chunk_size = len(ques_jsons) // num_gpus
+        chunk_size = len(test_question) // num_gpus
         from vllm import LLM
 
         llm = LLM(
@@ -109,7 +105,7 @@
             max_model_len=4096,
         )
         ans_jsons = []
-        for i in range(0, len(ques_jsons), chunk_size):
+        for i in range(0, len(test_question), chunk_size):
             output = self._batch_generate(
                 chat_template_ques_jsons[i : i + chunk_size],
                 test_category,
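
As a hedged aside (not the BFCL GLM handler, which uses its own `self.apply_chat_template` plus BFCL-specific preprocessing), the sketch below shows the general shape of iterating over in-memory `test_question` entries and building prompts with a Hugging Face chat template:

```python
# Stand-in sketch only: builds one chat-template prompt per parsed test entry,
# mirroring the "loop over in-memory entries" shape the handler now follows.
from transformers import AutoTokenizer

def build_prompts(test_question: list, model_name: str) -> list:
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompts = []
    for entry in test_question:
        # BFCL entries carry at least a "question" field; str() hedges the type.
        messages = [{"role": "user", "content": str(entry["question"])}]
        prompts.append(
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )
    return prompts
```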

@@ -35,10 +35,10 @@ def _format_prompt(prompt, function, test_category):
         )
 
     def inference(
-        self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, format_prompt_func
+            test_question, test_category, num_gpus, format_prompt_func
         )
 
     def decode_ast(self, result, language="Python"):

@@ -15,10 +15,10 @@ def _format_prompt(prompt, function, test_category):
         return conversations
 
     def inference(
-        self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, format_prompt_func
+            test_question, test_category, num_gpus, format_prompt_func
         )
 
     def decode_ast(self, result, language="Python"):
12 changes: 4 additions & 8 deletions berkeley-function-call-leaderboard/model_handler/oss_handler.py
@@ -84,20 +84,16 @@ def _batch_generate(
         return final_ans_jsons
 
     def inference(
-        self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
 
-        ques_jsons = []
-        with open(question_file, "r") as ques_file:
-            for line in ques_file:
-                ques_jsons.append(json.loads(line))
 
-        chunk_size = len(ques_jsons) // num_gpus
+        chunk_size = len(test_question) // num_gpus
         ans_handles = []
-        for i in range(0, len(ques_jsons), chunk_size):
+        for i in range(0, len(test_question), chunk_size):
             ans_handles.append(
                 self._batch_generate.remote(
-                    ques_jsons[i : i + chunk_size],
+                    test_question[i : i + chunk_size],
                     test_category,
                     self.model_name,
                     self.temperature,
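
The OSS handler now slices the in-memory question list into per-GPU chunks instead of re-reading a compiled file. A small, self-contained sketch of that slicing pattern (not the BFCL implementation; the `max(1, ...)` guard is added here only so the toy example also handles lists shorter than `num_gpus`):

```python
def split_into_gpu_chunks(test_question: list, num_gpus: int) -> list:
    """Split an in-memory list of test questions into roughly per-GPU chunks."""
    # The guard below is this sketch's addition; the handler above uses
    # len(test_question) // num_gpus directly.
    chunk_size = max(1, len(test_question) // num_gpus)
    return [
        test_question[i : i + chunk_size]
        for i in range(0, len(test_question), chunk_size)
    ]

if __name__ == "__main__":
    questions = [{"id": i, "question": f"example {i}"} for i in range(10)]
    for n, chunk in enumerate(split_into_gpu_chunks(questions, num_gpus=4)):
        print(f"chunk {n}: {len(chunk)} questions")
```

Slicing with a fixed `chunk_size` keeps each worker's batch roughly equal in size, matching the `self._batch_generate.remote(...)` dispatch shown in the handler above.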
57 changes: 30 additions & 27 deletions berkeley-function-call-leaderboard/openfunctions_evaluation.py
@@ -60,37 +60,40 @@ def load_file(test_category):
     if USE_COHERE_OPTIMIZATION and "command-r-plus" in args.model:
         args.model = args.model + "-optimized"
     handler = build_handler(args.model, args.temperature, args.top_p, args.max_tokens)
-    if handler.model_style == ModelStyle.OSSMODEL:
-        result = handler.inference(
-            question_file="eval_data_total.json",
-            test_category=args.test_category,
-            num_gpus=args.num_gpus,
-        )
-        for res in result[0]:
-            handler.write(res, "result.json")
-    else:
-        test_cate, files_to_open = load_file(args.test_category)
-        for test_category, file_to_open in zip(test_cate, files_to_open):
-            print("Generating: " + file_to_open)
-            test_cases = []
-            with open("./data/" + file_to_open) as f:
-                for line in f:
-                    test_cases.append(json.loads(line))
-            num_existing_result = 0 # if the result file already exists, skip the test cases that have been tested.
-            if os.path.exists(
+
+    test_cate, files_to_open = load_file(args.test_category)
+    for test_category, file_to_open in zip(test_cate, files_to_open):
+        print("Generating: " + file_to_open)
+        test_cases = []
+        with open("./data/" + file_to_open) as f:
+            for line in f:
+                test_cases.append(json.loads(line))
+        num_existing_result = 0 # if the result file already exists, skip the test cases that have been tested.
+        if os.path.exists(
+            "./result/"
+            + args.model.replace("/", "_")
+            + "/"
+            + file_to_open.replace(".json", "_result.json")
+        ):
+            with open(
                 "./result/"
                 + args.model.replace("/", "_")
                 + "/"
                 + file_to_open.replace(".json", "_result.json")
-            ):
-                with open(
-                    "./result/"
-                    + args.model.replace("/", "_")
-                    + "/"
-                    + file_to_open.replace(".json", "_result.json")
-                ) as f:
-                    for line in f:
-                        num_existing_result += 1
+            ) as f:
+                for line in f:
+                    num_existing_result += 1
+
+        if handler.model_style == ModelStyle.OSSMODEL:
+            result = handler.inference(
+                test_question = test_cases[num_existing_result:],
+                test_category = test_category,
+                num_gpus = args.num_gpus,
+            )
+            for index, res in enumerate(result[0]):
+                result_to_write = {"id": index, "result": res["text"]}
+                handler.write(result_to_write, file_to_open)
+        else:
             for index, test_case in enumerate(tqdm(test_cases)):
                 if index < num_existing_result:
                     continue
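
The per-category loop above also resumes interrupted runs by counting the lines already present in a category's result file and skipping that many test cases. A hedged sketch of that bookkeeping (the helper names and the example path are illustrative, not BFCL's):

```python
import os

def count_existing_results(result_path: str) -> int:
    """Number of result lines already written for this category, or 0 if none."""
    if not os.path.exists(result_path):
        return 0
    with open(result_path) as f:
        return sum(1 for _ in f)

def remaining_test_cases(test_cases: list, result_path: str) -> list:
    """Drop test cases that already have results, mirroring
    test_cases[num_existing_result:] in the script above."""
    num_existing_result = count_existing_results(result_path)
    return test_cases[num_existing_result:]

if __name__ == "__main__":
    cases = [{"id": i} for i in range(5)]
    # Example path only; prints 5 when no result file exists yet.
    print(len(remaining_test_cases(cases, "./result/example_model/example_result.json")))
```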
