From f973d9702123cf68bc84dc382dca66ab52f33992 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Sat, 9 Nov 2024 02:47:42 -0800 Subject: [PATCH 1/7] update handler map --- .../bfcl/model_handler/handler_map.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index eae669af0..f886426aa 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -43,10 +43,14 @@ "claude-3-opus-20240229-FC": ClaudeHandler, "claude-3-sonnet-20240229": ClaudeHandler, "claude-3-sonnet-20240229-FC": ClaudeHandler, - "claude-3-haiku-20240307": ClaudeHandler, - "claude-3-haiku-20240307-FC": ClaudeHandler, "claude-3-5-sonnet-20240620": ClaudeHandler, "claude-3-5-sonnet-20240620-FC": ClaudeHandler, + "claude-3-5-sonnet-20241022": ClaudeHandler, + "claude-3-5-sonnet-20241022-FC": ClaudeHandler, + "claude-3-haiku-20240307": ClaudeHandler, + "claude-3-haiku-20240307-FC": ClaudeHandler, + "claude-3-5-haiku-20241022": ClaudeHandler, + "claude-3-5-haiku-20241022-FC": ClaudeHandler, "open-mistral-nemo-2407": MistralHandler, "open-mistral-nemo-2407-FC": MistralHandler, "open-mixtral-8x22b": MistralHandler, From 7d2c36e703361e93d635a1fcced17f5226449d0e Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Sat, 9 Nov 2024 02:51:32 -0800 Subject: [PATCH 2/7] update model metadata --- .../bfcl/eval_checker/model_metadata.py | 36 +++++++++++++++++-- .../bfcl/model_handler/constant.py | 4 ++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py index 8a17ded10..03fa53684 100644 --- 
a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py @@ -175,6 +175,18 @@ "Anthropic", "Proprietary", ], + "claude-3-5-haiku-20241022-FC": [ + "claude-3.5-haiku-20241022 (FC tools-2024-04-04)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], + "claude-3-5-haiku-20241022": [ + "claude-3.5-haiku-20241022 (Prompt)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], "claude-3-5-sonnet-20240620-FC": [ "Claude-3.5-Sonnet-20240620 (FC)", "https://www.anthropic.com/news/claude-3-5-sonnet", @@ -187,6 +199,18 @@ "Anthropic", "Proprietary", ], + "claude-3-5-sonnet-20241022-FC": [ + "Claude-3.5-Sonnet-20241022 (FC)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20241022": [ + "Claude-3.5-Sonnet-20241022 (Prompt)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], "gpt-3.5-turbo-0125-FC": [ "GPT-3.5-Turbo-0125 (FC)", "https://platform.openai.com/docs/models/gpt-3-5-turbo", @@ -656,10 +680,14 @@ "claude-3-opus-20240229": 15, "claude-3-sonnet-20240229-FC": 3, "claude-3-sonnet-20240229": 3, - "claude-3-haiku-20240307-FC": 0.25, - "claude-3-haiku-20240307": 0.25, "claude-3-5-sonnet-20240620-FC": 3, "claude-3-5-sonnet-20240620": 3, + "claude-3-5-sonnet-20241022-FC": 3, + "claude-3-5-sonnet-20241022": 3, + "claude-3-haiku-20240307-FC": 0.25, + "claude-3-haiku-20240307": 0.25, + "claude-3-5-haiku-20241022-FC": 1, + "claude-3-5-haiku-20241022": 1, "claude-2.1": 8, "claude-instant-1.2": 0.8, "open-mistral-nemo-2407": 0.3, @@ -716,8 +744,12 @@ "claude-3-sonnet-20240229": 15, "claude-3-5-sonnet-20240620-FC": 15, "claude-3-5-sonnet-20240620": 15, + "claude-3-5-sonnet-20241022-FC": 15, + "claude-3-5-sonnet-20241022": 15, "claude-3-haiku-20240307-FC": 1.25, 
"claude-3-haiku-20240307": 1.25, + "claude-3-5-haiku-20241022-FC": 5, + "claude-3-5-haiku-20241022": 5, "claude-2.1": 24, "claude-instant-1.2": 2.4, "open-mistral-nemo-2407": 0.3, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py index a27ca49da..6ad37cd33 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py @@ -122,8 +122,10 @@ "gpt-3.5-turbo-0125-FC", "claude-3-opus-20240229-FC", "claude-3-sonnet-20240229-FC", - "claude-3-haiku-20240307-FC", "claude-3-5-sonnet-20240620-FC", + "claude-3-5-sonnet-20241022-FC", + "claude-3-haiku-20240307-FC", + "claude-3-5-haiku-20241022-FC", "open-mistral-nemo-2407-FC", "open-mixtral-8x22b-FC", "mistral-large-2407-FC", From 987c1d23f0d4acb117807bfc7df2d91ab07e6257 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Sat, 9 Nov 2024 02:54:45 -0800 Subject: [PATCH 3/7] update max token limit for new claude 3.5 --- .../bfcl/model_handler/proprietary_model/claude.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py index 27ec930f6..62e5f0850 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py @@ -78,7 +78,7 @@ def _query_FC(self, inference_data: dict): return self.client.messages.create( model=self.model_name.strip("-FC"), max_tokens=( - 8192 if "claude-3-5-sonnet-20240620" in self.model_name else 4096 + 8192 if "claude-3-5" in self.model_name else 4096 ), # 3.5 Sonnet has a higher max token limit tools=inference_data["tools"], messages=inference_data["message"], From 059bd5e38cdb01f1997089af3907b2aab96f7704 
Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Sat, 9 Nov 2024 02:54:52 -0800 Subject: [PATCH 4/7] update change log --- berkeley-function-call-leaderboard/CHANGELOG.md | 5 +++++ berkeley-function-call-leaderboard/README.md | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md index fed6560e5..d7a659e18 100644 --- a/berkeley-function-call-leaderboard/CHANGELOG.md +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file. +- [Nov 9, 2024] [#750](https://github.com/ShishirPatil/gorilla/pull/750): Add the following new models to the leaderboard: + - `claude-3-5-haiku-20241022` + - `claude-3-5-haiku-20241022-FC` + - `claude-3-5-sonnet-20241022` + - `claude-3-5-sonnet-20241022-FC` - [Oct 30, 2024] [#725](https://github.com/ShishirPatil/gorilla/pull/725), [#733](https://github.com/ShishirPatil/gorilla/pull/733): Update evaluation metric for multi-turn categories: - Introduce a new response-based checker, which works alongside with the existing state-based checker. - The new checker compares the model’s execution result against the ground truth execution result, ensuring that the model’s result encompasses the ground truth (i.e., ground truth must be a strict subset of the model result). 
diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 4acbf926a..86c2d8e72 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -146,8 +146,10 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains |gorilla-openfunctions-v2 | Function Calling| |claude-3-{opus-20240229,sonnet-20240229,haiku-20240307}-FC | Function Calling | |claude-3-{opus-20240229,sonnet-20240229,haiku-20240307} | Prompt | -|claude-3-5-sonnet-20240620-FC | Function Calling | -|claude-3-5-sonnet-20240620 | Prompt | +|claude-3-5-sonnet-{20240620,20241022}-FC | Function Calling | +|claude-3-5-sonnet-{20240620,20241022} | Prompt | +|claude-3-5-haiku-20241022-FC | Function Calling | +|claude-3-5-haiku-20241022 | Prompt | |claude-{2.1,instant-1.2}| Prompt| |command-r-plus-FC | Function Calling| |command-r-plus | Prompt| From 7307b5c03072db26c3ead71a5bb2657ee8136b61 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Tue, 12 Nov 2024 17:19:24 -0800 Subject: [PATCH 5/7] bump anthropic pypi version --- berkeley-function-call-leaderboard/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index e29f9acb2..81330701a 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "tree-sitter-javascript==0.21.4", "openai==1.46.0", "mistralai==1.1.0", - "anthropic==0.37.1", + "anthropic==0.39.0", "cohere==5.5.8", "typer>=0.12.5", "tabulate>=0.9.0", From 9827f4986a5ec04b28c2e295a4a6e8fb66520aae Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Mon, 18 Nov 2024 20:49:40 -0800 Subject: [PATCH 6/7] suppress redundant console logging --- .../bfcl/model_handler/proprietary_model/gemini.py | 2 -- 1 
file changed, 2 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py index 805fa0112..98b331539 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py @@ -1,4 +1,3 @@ -import logging import os import vertexai @@ -29,7 +28,6 @@ Tool, ) -logging.basicConfig(level=logging.INFO) class GeminiHandler(BaseHandler): def __init__(self, model_name, temperature) -> None: From daba0288afce29a7c80c0b9da07de491350b5cb2 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" <huanzhimao@gmail.com> Date: Mon, 18 Nov 2024 22:32:49 -0800 Subject: [PATCH 7/7] update model metadata --- .../bfcl/eval_checker/model_metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py index 84c801dc3..65d604042 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py @@ -92,7 +92,7 @@ "Apache 2.0", ], "claude-3-opus-20240229-FC": [ - "Claude-3-Opus-20240229 (FC tools-2024-04-04)", + "Claude-3-Opus-20240229 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -152,7 +152,7 @@ "Proprietary", ], "claude-3-sonnet-20240229-FC": [ - "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", + "Claude-3-Sonnet-20240229 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -164,7 +164,7 @@ "Proprietary", ], "claude-3-haiku-20240307-FC": [ - "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", + "Claude-3-Haiku-20240307 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -176,7 +176,7 @@ 
"Proprietary", ], "claude-3-5-haiku-20241022-FC": [ - "claude-3.5-haiku-20241022 (FC tools-2024-04-04)", + "claude-3.5-haiku-20241022 (FC)", "https://www.anthropic.com/news/3-5-models-and-computer-use", "Anthropic", "Proprietary",