[BFCL] Chore: Various Improvements and Adjustments #673

Merged 9 commits on Oct 9, 2024
berkeley-function-call-leaderboard/bfcl/constant.py (4 changes: 2 additions & 2 deletions)
@@ -54,14 +54,14 @@
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"multi_turn": [
"multi_turn_base",
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"single_turn": [
"exec_simple",
@@ -35,6 +35,20 @@ def multi_turn_runner(
multi_turn_ground_truth_list: list[list[str]] = possible_answer[i]["ground_truth"]
test_entry: dict = prompt[i]

if type(multi_turn_model_result_list) != list:
result.append(
{
"id": index,
"model_name": model_name,
"test_category": test_category,
"valid": False,
"error": ["Error during inference phase. Model did not output a list of model responses."],
"error_type": "multi_turn:inference_error",
"prompt": test_entry,
"model_result": multi_turn_model_result_list,
"possible_answer": multi_turn_ground_truth_list,
}
)
# Check if force-terminated during inference phase.
# This happens when the model has retried too many times and still hasn't figured out the answer.
# When force-terminated, no further evaluation is needed; the whole entry is marked as failed.
@@ -421,7 +435,7 @@ def runner(model_names, test_categories, api_sanity_check):
subdirs = [entry.path for entry in entries if entry.is_dir()]

# Traverse each subdirectory
for subdir in subdirs:
for subdir in tqdm(subdirs, desc="Number of models evaluated"):

model_name = subdir.split(INPUT_PATH)[1]
if model_names is not None and model_name not in model_names:
@@ -338,15 +338,15 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
) / 1000
cost = round(cost, 2)

if model_name in OSS_LATENCY:
mean_latency, std_latency, percentile_95_latency = (
OSS_LATENCY[model_name] / 1700,
"N/A",
"N/A",
)
mean_latency = round(mean_latency, 2)
cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
cost = round(cost, 2)
# if model_name in OSS_LATENCY:
# mean_latency, std_latency, percentile_95_latency = (
# OSS_LATENCY[model_name] / 1700,
# "N/A",
# "N/A",
# )
# mean_latency = round(mean_latency, 2)
# cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
# cost = round(cost, 2)

elif len(latency_data["data"]) != 0:
mean_latency = statistics.mean(latency_data["data"])
@@ -612,7 +612,7 @@ def generate_leaderboard_csv(
)

# Write Non-Live Score File
data_non_live.sort(key=lambda x: x[1], reverse=True)
data_non_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_non_live)):
data_non_live[i][0] = str(i + 1)
for j in range(2, len(data_non_live[i])):
@@ -629,7 +629,7 @@
f.write(",".join(row))

# Write Live Score File
data_live.sort(key=lambda x: x[1], reverse=True)
data_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_live)):
data_live[i][0] = str(i + 1)
for j in range(2, len(data_live[i])):
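Note on the sort-key change above (x[1] → x[2]): the score tables are sorted by a numeric column in descending order and then re-ranked in place. A toy sketch of that pattern follows; the row layout (rank, model name, overall score) is assumed for illustration and is not taken from the leaderboard code.

```python
# Toy sketch of the sort-then-rerank pattern used in generate_leaderboard_csv above.
# The row layout (rank, model name, overall score) is an assumption for this example.
rows = [
    ["", "model-a", 81.53],
    ["", "model-b", 92.30],
    ["", "model-c", 85.17],
]
rows.sort(key=lambda x: x[2], reverse=True)  # sort by the numeric score in column index 2
for i, row in enumerate(rows):
    row[0] = str(i + 1)  # rewrite the 1-based rank column after sorting
print(rows)  # [['1', 'model-b', 92.3], ['2', 'model-c', 85.17], ['3', 'model-a', 81.53]]
```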
@@ -1,3 +1,5 @@
from bfcl.model_handler.handler_map import local_inference_handler_map

MODEL_METADATA_MAPPING = {
"o1-preview-2024-09-12": [
"o1-preview-2024-09-12 (Prompt)",
@@ -203,12 +205,6 @@
"MeetKai",
"MIT",
],
"meetkai/functionary-small-v3.2-FC": [
"Functionary-Small-v3.2 (FC)",
"https://huggingface.co/meetkai/functionary-small-v3.2",
"MeetKai",
"MIT",
],
"meetkai/functionary-medium-v3.1-FC": [
"Functionary-Medium-v3.1 (FC)",
"https://huggingface.co/meetkai/functionary-medium-v3.1",
@@ -733,35 +729,15 @@
# Because we batch the requests when generating the data, the latency recorded in the result data is not accurate.
# The values below are the latency for the whole batch of data, measured on 8 V100 GPUs.
OSS_LATENCY = {
"deepseek-ai/deepseek-coder-6.7b-instruct": 909,
"google/gemma-7b-it": 95,
"NousResearch/Hermes-2-Pro-Mistral-7B": 135,
"NousResearch/Hermes-2-Pro-Llama-3-8B": 77,
"NousResearch/Hermes-2-Theta-Llama-3-8B": 73,
"NousResearch/Hermes-2-Theta-Llama-3-70B": 716,
"NousResearch/Hermes-2-Pro-Llama-3-70B": 674,
"meta-llama/Meta-Llama-3-8B-Instruct": 73,
"meta-llama/Meta-Llama-3-70B-Instruct": 307,
"gorilla-openfunctions-v2": 83,
"THUDM/glm-4-9b-chat": 223,
}


NO_COST_MODELS = [
# All OSS models will have no cost shown on the leaderboard.
NO_COST_MODELS = list(local_inference_handler_map.keys()) + [
"Nexusflow-Raven-v2",
"firefunction-v1-FC",
"firefunction-v2-FC",
"meetkai/functionary-small-v3.1-FC",
"meetkai/functionary-small-v3.2-FC",
"meetkai/functionary-medium-v3.1-FC",
"snowflake/arctic",
"nvidia/nemotron-4-340b-instruct",
"ibm-granite/granite-20b-functioncalling",
"THUDM/glm-4-9b-chat",
"Salesforce/xLAM-1b-fc-r",
"Salesforce/xLAM-7b-fc-r",
"Salesforce/xLAM-7b-r",
"Salesforce/xLAM-8x7b-r",
"Salesforce/xLAM-8x22b-r",
"Team-ACE/ToolACE-8B",
]
@@ -49,7 +49,7 @@ def _append(self, additional_content: str) -> None:
self.last_modified = datetime.datetime.now()

def __repr__(self):
return f"<File: {self.name}, Last Modified: {self.last_modified}>"
return f"<<File: {self.name}, Last Modified: {self.last_modified}, Content: {self.content}>>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, File):
@@ -121,7 +121,7 @@ def _list_contents(self) -> List[str]:
return list(self.contents.keys())

def __repr__(self):
return f"<Directory: {self.name}, Contents: {list(self.contents.keys())}>"
return f"<Directory: {self.name}, Parent: {self.parent.name if self.parent else None}, Contents: {self.contents}>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, Directory):
@@ -24,8 +24,9 @@
from bfcl.model_handler.proprietary_model.yi import YiHandler

# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
handler_map = {
# Inference through API calls

# Inference through API calls
api_inference_handler_map = {
"gorilla-openfunctions-v2": GorillaHandler,
"o1-preview-2024-09-12": OpenAIHandler,
"o1-mini-2024-09-12": OpenAIHandler,
@@ -68,7 +69,7 @@
"gemini-1.5-flash-001-FC": GeminiHandler,
"gemini-1.0-pro-002": GeminiHandler,
"gemini-1.0-pro-002-FC": GeminiHandler,
"meetkai/functionary-small-v3.2-FC": FunctionaryHandler,
"meetkai/functionary-small-v3.1-FC": FunctionaryHandler,
"meetkai/functionary-medium-v3.1-FC": FunctionaryHandler,
"databricks-dbrx-instruct": DatabricksHandler,
"command-r-plus-FC": CohereHandler,
@@ -77,8 +78,11 @@
"command-r-plus-optimized": CohereHandler,
"snowflake/arctic": NvidiaHandler,
"nvidia/nemotron-4-340b-instruct": NvidiaHandler,
"yi-large-fc": YiHandler,
# Inference through local hosting
# "yi-large-fc": YiHandler, # Their API is under maintenance, and will not be back online in the near future
}

# Inference through local hosting
local_inference_handler_map = {
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
@@ -114,8 +118,10 @@
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
"Team-ACE/ToolACE-8B": LlamaHandler,

# Deprecated/outdated models, no longer on the leaderboard
}

# Deprecated/outdated models, no longer on the leaderboard
outdated_model_handler_map = {
# "gorilla-openfunctions-v0": GorillaHandler,
# "gpt-4o-2024-05-13": OpenAIHandler,
# "gpt-4o-2024-05-13-FC": OpenAIHandler,
@@ -135,3 +141,5 @@
# "google/gemma-7b-it": GemmaHandler,
# "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
}

handler_map = {**api_inference_handler_map, **local_inference_handler_map}
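
A minimal usage sketch of the split maps (illustrative only, not code from this PR): handler_map keeps the old all-in-one lookup, while the sub-maps let other modules, such as NO_COST_MODELS above, distinguish API-served models from locally hosted ones.

```python
# Illustrative sketch, not part of this PR: resolving a handler after the map split.
# The model name below is one entry from local_inference_handler_map; everything else
# mirrors how a caller could consume the three exported maps.
from bfcl.model_handler.handler_map import (
    api_inference_handler_map,
    handler_map,
    local_inference_handler_map,
)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
handler_cls = handler_map[model_name]  # combined lookup behaves the same as before the split
is_locally_hosted = model_name in local_inference_handler_map  # e.g., no cost shown on the leaderboard
is_api_based = model_name in api_inference_handler_map
print(handler_cls.__name__, is_locally_hosted, is_api_based)
```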
@@ -70,8 +70,8 @@ def xlam_json_to_python_tool_calls(tool_calls):
[f"{key}={repr(value)}" for key, value in arguments.items()]
)
python_format.append(f"{name}({args_str})")
else:
print(f"Invalid format: {tool_call}")
# else:
# print(f"Invalid format: {tool_call}")

return python_format
