[BFCL] Chore: Various Improvements and Adjustments #673

Merged 9 commits on Oct 9, 2024
berkeley-function-call-leaderboard/bfcl/constant.py (4 changes: 2 additions & 2 deletions)
@@ -54,14 +54,14 @@
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"multi_turn": [
"multi_turn_base",
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"single_turn": [
"exec_simple",
@@ -35,6 +35,20 @@ def multi_turn_runner(
multi_turn_ground_truth_list: list[list[str]] = possible_answer[i]["ground_truth"]
test_entry: dict = prompt[i]

if type(multi_turn_model_result_list) != list:
result.append(
{
"id": index,
"model_name": model_name,
"test_category": test_category,
"valid": False,
"error": ["Error during inference phase. Model did not output a list of model responses."],
"error_type": "multi_turn:inference_error",
"prompt": test_entry,
"model_result": multi_turn_model_result_list,
"possible_answer": multi_turn_ground_truth_list,
}
)
# Check if force-terminated during inference phase.
# This happens when the model has retried too many times and still hasn't figured out the answer.
# When force-terminated, no further evaluation is needed; the whole entry is marked as failed.
@@ -421,7 +435,7 @@ def runner(model_names, test_categories, api_sanity_check):
subdirs = [entry.path for entry in entries if entry.is_dir()]

# Traverse each subdirectory
for subdir in subdirs:
for subdir in tqdm(subdirs, desc="Number of models evaluated"):

model_name = subdir.split(INPUT_PATH)[1]
if model_names is not None and model_name not in model_names:
@@ -338,15 +338,15 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
) / 1000
cost = round(cost, 2)

if model_name in OSS_LATENCY:
mean_latency, std_latency, percentile_95_latency = (
OSS_LATENCY[model_name] / 1700,
"N/A",
"N/A",
)
mean_latency = round(mean_latency, 2)
cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
cost = round(cost, 2)
# if model_name in OSS_LATENCY:
# mean_latency, std_latency, percentile_95_latency = (
# OSS_LATENCY[model_name] / 1700,
# "N/A",
# "N/A",
# )
# mean_latency = round(mean_latency, 2)
# cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
# cost = round(cost, 2)

elif len(latency_data["data"]) != 0:
mean_latency = statistics.mean(latency_data["data"])
@@ -612,7 +612,7 @@ def generate_leaderboard_csv(
)

# Write Non-Live Score File
data_non_live.sort(key=lambda x: x[1], reverse=True)
data_non_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_non_live)):
data_non_live[i][0] = str(i + 1)
for j in range(2, len(data_non_live[i])):
@@ -629,7 +629,7 @@
f.write(",".join(row))

# Write Live Score File
data_live.sort(key=lambda x: x[1], reverse=True)
data_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_live)):
data_live[i][0] = str(i + 1)
for j in range(2, len(data_live[i])):
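Note on the sort-key change above (x[1] → x[2]): the score tables are sorted by a numeric column in descending order and then re-ranked in place. A toy sketch of that pattern follows; the row layout (rank, model name, overall score) is assumed for illustration and is not taken from the leaderboard code.

```python
# Toy sketch of the sort-then-rerank pattern used in generate_leaderboard_csv above.
# The row layout (rank, model name, overall score) is an assumption for this example.
rows = [
    ["", "model-a", 81.53],
    ["", "model-b", 92.30],
    ["", "model-c", 85.17],
]
rows.sort(key=lambda x: x[2], reverse=True)  # sort by the numeric score in column index 2
for i, row in enumerate(rows):
    row[0] = str(i + 1)  # rewrite the 1-based rank column after sorting
print(rows)  # [['1', 'model-b', 92.3], ['2', 'model-c', 85.17], ['3', 'model-a', 81.53]]
```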
@@ -1,3 +1,5 @@
from bfcl.model_handler.handler_map import local_inference_handler_map

MODEL_METADATA_MAPPING = {
"o1-preview-2024-09-12": [
"o1-preview-2024-09-12 (Prompt)",
@@ -203,12 +205,6 @@
"MeetKai",
"MIT",
],
"meetkai/functionary-small-v3.2-FC": [
"Functionary-Small-v3.2 (FC)",
"https://huggingface.co/meetkai/functionary-small-v3.2",
"MeetKai",
"MIT",
],
"meetkai/functionary-medium-v3.1-FC": [
"Functionary-Medium-v3.1 (FC)",
"https://huggingface.co/meetkai/functionary-medium-v3.1",
@@ -733,35 +729,15 @@
# Because we batch the requests when generating the data, the latency recorded in the result data is not accurate.
# The values below are the latency for the whole batch of data, measured on 8 V100 GPUs.
OSS_LATENCY = {
"deepseek-ai/deepseek-coder-6.7b-instruct": 909,
"google/gemma-7b-it": 95,
"NousResearch/Hermes-2-Pro-Mistral-7B": 135,
"NousResearch/Hermes-2-Pro-Llama-3-8B": 77,
"NousResearch/Hermes-2-Theta-Llama-3-8B": 73,
"NousResearch/Hermes-2-Theta-Llama-3-70B": 716,
"NousResearch/Hermes-2-Pro-Llama-3-70B": 674,
"meta-llama/Meta-Llama-3-8B-Instruct": 73,
"meta-llama/Meta-Llama-3-70B-Instruct": 307,
"gorilla-openfunctions-v2": 83,
"THUDM/glm-4-9b-chat": 223,
}


NO_COST_MODELS = [
# All OSS models will have no cost shown on the leaderboard.
NO_COST_MODELS = list(local_inference_handler_map.keys()) + [
"Nexusflow-Raven-v2",
"firefunction-v1-FC",
"firefunction-v2-FC",
"meetkai/functionary-small-v3.1-FC",
"meetkai/functionary-small-v3.2-FC",
"meetkai/functionary-medium-v3.1-FC",
"snowflake/arctic",
"nvidia/nemotron-4-340b-instruct",
"ibm-granite/granite-20b-functioncalling",
"THUDM/glm-4-9b-chat",
"Salesforce/xLAM-1b-fc-r",
"Salesforce/xLAM-7b-fc-r",
"Salesforce/xLAM-7b-r",
"Salesforce/xLAM-8x7b-r",
"Salesforce/xLAM-8x22b-r",
"Team-ACE/ToolACE-8B",
]
@@ -49,7 +49,7 @@ def _append(self, additional_content: str) -> None:
self.last_modified = datetime.datetime.now()

def __repr__(self):
return f"<File: {self.name}, Last Modified: {self.last_modified}>"
return f"<<File: {self.name}, Last Modified: {self.last_modified}, Content: {self.content}>>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, File):
@@ -121,7 +121,7 @@ def _list_contents(self) -> List[str]:
return list(self.contents.keys())

def __repr__(self):
return f"<Directory: {self.name}, Contents: {list(self.contents.keys())}>"
return f"<Directory: {self.name}, Parent: {self.parent.name if self.parent else None}, Contents: {self.contents}>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, Directory):
@@ -24,8 +24,9 @@
from bfcl.model_handler.proprietary_model.yi import YiHandler

# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
handler_map = {
# Inference through API calls

# Inference through API calls
api_inference_handler_map = {
"gorilla-openfunctions-v2": GorillaHandler,
"o1-preview-2024-09-12": OpenAIHandler,
"o1-mini-2024-09-12": OpenAIHandler,
@@ -68,7 +69,7 @@
"gemini-1.5-flash-001-FC": GeminiHandler,
"gemini-1.0-pro-002": GeminiHandler,
"gemini-1.0-pro-002-FC": GeminiHandler,
"meetkai/functionary-small-v3.2-FC": FunctionaryHandler,
"meetkai/functionary-small-v3.1-FC": FunctionaryHandler,
"meetkai/functionary-medium-v3.1-FC": FunctionaryHandler,
"databricks-dbrx-instruct": DatabricksHandler,
"command-r-plus-FC": CohereHandler,
@@ -77,8 +78,11 @@
"command-r-plus-optimized": CohereHandler,
"snowflake/arctic": NvidiaHandler,
"nvidia/nemotron-4-340b-instruct": NvidiaHandler,
"yi-large-fc": YiHandler,
# Inference through local hosting
# "yi-large-fc": YiHandler, # Their API is under maintenance, and will not be back online in the near future
}

# Inference through local hosting
local_inference_handler_map = {
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
@@ -114,8 +118,10 @@
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
"Team-ACE/ToolACE-8B": LlamaHandler,

# Deprecated/outdated models, no longer on the leaderboard
}

# Deprecated/outdated models, no longer on the leaderboard
outdated_model_handler_map = {
# "gorilla-openfunctions-v0": GorillaHandler,
# "gpt-4o-2024-05-13": OpenAIHandler,
# "gpt-4o-2024-05-13-FC": OpenAIHandler,
@@ -135,3 +141,5 @@
# "google/gemma-7b-it": GemmaHandler,
# "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
}

handler_map = {**api_inference_handler_map, **local_inference_handler_map}
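
A minimal usage sketch of the split maps (illustrative only, not code from this PR): handler_map keeps the old all-in-one lookup, while the sub-maps let other modules, such as NO_COST_MODELS above, distinguish API-served models from locally hosted ones.

```python
# Illustrative sketch, not part of this PR: resolving a handler after the map split.
# The model name below is one entry from local_inference_handler_map; everything else
# mirrors how a caller could consume the three exported maps.
from bfcl.model_handler.handler_map import (
    api_inference_handler_map,
    handler_map,
    local_inference_handler_map,
)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
handler_cls = handler_map[model_name]  # combined lookup behaves the same as before the split
is_locally_hosted = model_name in local_inference_handler_map  # e.g., no cost shown on the leaderboard
is_api_based = model_name in api_inference_handler_map
print(handler_cls.__name__, is_locally_hosted, is_api_based)
```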
@@ -70,8 +70,8 @@ def xlam_json_to_python_tool_calls(tool_calls):
[f"{key}={repr(value)}" for key, value in arguments.items()]
)
python_format.append(f"{name}({args_str})")
else:
print(f"Invalid format: {tool_call}")
# else:
# print(f"Invalid format: {tool_call}")

return python_format
