diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md index 3834cfb8c..882d4577b 100644 --- a/berkeley-function-call-leaderboard/CHANGELOG.md +++ b/berkeley-function-call-leaderboard/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file. +- [Nov 19, 2024] [#750](https://github.com/ShishirPatil/gorilla/pull/750): Add the following new models to the leaderboard: + - `claude-3-5-haiku-20241022` + - `claude-3-5-haiku-20241022-FC` + - `claude-3-5-sonnet-20241022` + - `claude-3-5-sonnet-20241022-FC` - [Nov 18, 2024] [#736](https://github.com/ShishirPatil/gorilla/pull/736): Add the option to additionally log the evaluation results to [WandB](https://github.com/wandb/wandb) artifacts. User can enable this feature by providing the entity and project name in `WANDB_BFCL_PROJECT` in the `.env` file. - [Nov 18, 2024] [#768](https://github.com/ShishirPatil/gorilla/pull/768), [#770](https://github.com/ShishirPatil/gorilla/pull/770): Resolve issues in Gemini models (FC mode) related to handling scenarios with no tools available and cases where the model output is empty. - [Nov 17, 2024] [#767](https://github.com/ShishirPatil/gorilla/pull/767): Fix price and latency calculation. A merge conflict results in a duplicate line, and counting the input and output token for each entry multiple times. 
diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md index 579c4dee7..24a69a719 100644 --- a/berkeley-function-call-leaderboard/README.md +++ b/berkeley-function-call-leaderboard/README.md @@ -150,8 +150,10 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains |gorilla-openfunctions-v2 | Function Calling| |claude-3-{opus-20240229,sonnet-20240229,haiku-20240307}-FC | Function Calling | |claude-3-{opus-20240229,sonnet-20240229,haiku-20240307} | Prompt | -|claude-3-5-sonnet-20240620-FC | Function Calling | -|claude-3-5-sonnet-20240620 | Prompt | +|claude-3-5-sonnet-{20240620,20241022}-FC | Function Calling | +|claude-3-5-sonnet-{20240620,20241022} | Prompt | +|claude-3-5-haiku-20241022-FC | Function Calling | +|claude-3-5-haiku-20241022 | Prompt | |claude-{2.1,instant-1.2}| Prompt| |command-r-plus-FC | Function Calling| |command-r-plus | Prompt| diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py index 5f2365c5c..65d604042 100644 --- a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py +++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py @@ -92,7 +92,7 @@ "Apache 2.0", ], "claude-3-opus-20240229-FC": [ - "Claude-3-Opus-20240229 (FC tools-2024-04-04)", + "Claude-3-Opus-20240229 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -152,7 +152,7 @@ "Proprietary", ], "claude-3-sonnet-20240229-FC": [ - "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)", + "Claude-3-Sonnet-20240229 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -164,7 +164,7 @@ "Proprietary", ], "claude-3-haiku-20240307-FC": [ - "Claude-3-Haiku-20240307 (FC tools-2024-04-04)", + "Claude-3-Haiku-20240307 (FC)", "https://www.anthropic.com/news/claude-3-family", "Anthropic", "Proprietary", @@ -175,6 +175,18 @@ 
"Anthropic", "Proprietary", ], + "claude-3-5-haiku-20241022-FC": [ + "claude-3.5-haiku-20241022 (FC)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], + "claude-3-5-haiku-20241022": [ + "claude-3.5-haiku-20241022 (Prompt)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], "claude-3-5-sonnet-20240620-FC": [ "Claude-3.5-Sonnet-20240620 (FC)", "https://www.anthropic.com/news/claude-3-5-sonnet", @@ -187,6 +199,18 @@ "Anthropic", "Proprietary", ], + "claude-3-5-sonnet-20241022-FC": [ + "Claude-3.5-Sonnet-20241022 (FC)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], + "claude-3-5-sonnet-20241022": [ + "Claude-3.5-Sonnet-20241022 (Prompt)", + "https://www.anthropic.com/news/3-5-models-and-computer-use", + "Anthropic", + "Proprietary", + ], "gpt-3.5-turbo-0125-FC": [ "GPT-3.5-Turbo-0125 (FC)", "https://platform.openai.com/docs/models/gpt-3-5-turbo", @@ -650,10 +674,14 @@ "claude-3-opus-20240229": 15, "claude-3-sonnet-20240229-FC": 3, "claude-3-sonnet-20240229": 3, - "claude-3-haiku-20240307-FC": 0.25, - "claude-3-haiku-20240307": 0.25, "claude-3-5-sonnet-20240620-FC": 3, "claude-3-5-sonnet-20240620": 3, + "claude-3-5-sonnet-20241022-FC": 3, + "claude-3-5-sonnet-20241022": 3, + "claude-3-haiku-20240307-FC": 0.25, + "claude-3-haiku-20240307": 0.25, + "claude-3-5-haiku-20241022-FC": 1, + "claude-3-5-haiku-20241022": 1, "claude-2.1": 8, "claude-instant-1.2": 0.8, "open-mistral-nemo-2407": 0.3, @@ -710,8 +738,12 @@ "claude-3-sonnet-20240229": 15, "claude-3-5-sonnet-20240620-FC": 15, "claude-3-5-sonnet-20240620": 15, + "claude-3-5-sonnet-20241022-FC": 15, + "claude-3-5-sonnet-20241022": 15, "claude-3-haiku-20240307-FC": 1.25, "claude-3-haiku-20240307": 1.25, + "claude-3-5-haiku-20241022-FC": 5, + "claude-3-5-haiku-20241022": 5, "claude-2.1": 24, "claude-instant-1.2": 2.4, "open-mistral-nemo-2407": 0.3, diff --git 
a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py index 4a8651c9a..d877c9b4d 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py @@ -122,8 +122,10 @@ "gpt-3.5-turbo-0125-FC", "claude-3-opus-20240229-FC", "claude-3-sonnet-20240229-FC", - "claude-3-haiku-20240307-FC", "claude-3-5-sonnet-20240620-FC", + "claude-3-5-sonnet-20241022-FC", + "claude-3-haiku-20240307-FC", + "claude-3-5-haiku-20241022-FC", "open-mistral-nemo-2407-FC", "open-mixtral-8x22b-FC", "mistral-large-2407-FC", diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index dac419b88..0ecec998e 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -44,10 +44,14 @@ "claude-3-opus-20240229-FC": ClaudeHandler, "claude-3-sonnet-20240229": ClaudeHandler, "claude-3-sonnet-20240229-FC": ClaudeHandler, - "claude-3-haiku-20240307": ClaudeHandler, - "claude-3-haiku-20240307-FC": ClaudeHandler, "claude-3-5-sonnet-20240620": ClaudeHandler, "claude-3-5-sonnet-20240620-FC": ClaudeHandler, + "claude-3-5-sonnet-20241022": ClaudeHandler, + "claude-3-5-sonnet-20241022-FC": ClaudeHandler, + "claude-3-haiku-20240307": ClaudeHandler, + "claude-3-haiku-20240307-FC": ClaudeHandler, + "claude-3-5-haiku-20241022": ClaudeHandler, + "claude-3-5-haiku-20241022-FC": ClaudeHandler, "open-mistral-nemo-2407": MistralHandler, "open-mistral-nemo-2407-FC": MistralHandler, "open-mixtral-8x22b": MistralHandler, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py index f63984076..bc98c450b 100644 --- 
a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/claude.py @@ -92,7 +92,9 @@ def _query_FC(self, inference_data: dict): return self.client.beta.prompt_caching.messages.create( model=self.model_name.strip("-FC"), - max_tokens=(8192 if "claude-3-5-sonnet-20240620" in self.model_name else 4096), + max_tokens=( + 8192 if "claude-3-5" in self.model_name else 4096 + ), # Claude 3.5 models (Sonnet and Haiku) have a higher max token limit tools=inference_data["tools"], messages=messages, ) diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py index 805fa0112..98b331539 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py @@ -1,4 +1,3 @@ -import logging import os import vertexai @@ -29,7 +28,6 @@ Tool, ) -logging.basicConfig(level=logging.INFO) class GeminiHandler(BaseHandler): def __init__(self, model_name, temperature) -> None: diff --git a/berkeley-function-call-leaderboard/pyproject.toml b/berkeley-function-call-leaderboard/pyproject.toml index c8935c8a7..705a7538f 100644 --- a/berkeley-function-call-leaderboard/pyproject.toml +++ b/berkeley-function-call-leaderboard/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "tree-sitter-javascript==0.21.4", "openai==1.46.0", "mistralai==1.1.0", - "anthropic==0.37.1", + "anthropic==0.39.0", "cohere==5.5.8", "typer>=0.12.5", "tabulate>=0.9.0",