diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md
index bafab9543..7fec3e11c 100644
--- a/berkeley-function-call-leaderboard/CHANGELOG.md
+++ b/berkeley-function-call-leaderboard/CHANGELOG.md
@@ -12,6 +12,17 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
   - `microsoft/Phi-3-mini-128k-instruct`
   - `microsoft/Phi-3-mini-4k-instruct`
 - [Sept 25, 2024] [#660](https://github.com/ShishirPatil/gorilla/pull/660): Bug fix in `parse_nested_value` function to handle nested dictionary values properly.
+- [Sept 24, 2024] [#648](https://github.com/ShishirPatil/gorilla/pull/648): Add the following new models to the leaderboard:
+  - `gemini-1.5-pro-002`
+  - `gemini-1.5-pro-002-FC`
+  - `gemini-1.5-pro-001`
+  - `gemini-1.5-pro-001-FC`
+  - `gemini-1.5-flash-002`
+  - `gemini-1.5-flash-002-FC`
+  - `gemini-1.5-flash-001`
+  - `gemini-1.5-flash-001-FC`
+  - `gemini-1.0-pro-002`
+  - `gemini-1.0-pro-002-FC`
 - [Sept 19, 2024] [#644](https://github.com/ShishirPatil/gorilla/pull/644): BFCL V3 release:
   - Introduce new multi-turn dataset and state-based evaluation metric
   - Separate ast_checker and executable_checker for readability
diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index edb3f6829..5d06a026b 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -132,9 +132,12 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains
 |databrick-dbrx-instruct | Prompt|
 |deepseek-ai/deepseek-coder-6.7b-instruct 💻| Prompt|
 |firefunction-{v1,v2}-FC | Function Calling|
-|gemini-1.0-pro-FC | Function Calling|
-|gemini-1.5-pro-preview-{0409,0514}-FC | Function Calling|
-|gemini-1.5-flash-preview-0514-FC | Function Calling|
+|gemini-1.0-pro-{001,002}-FC | Function Calling|
+|gemini-1.0-pro-{001,002} | Prompt|
+|gemini-1.5-pro-{001,002}-FC | Function Calling|
+|gemini-1.5-pro-{001,002} | Prompt|
+|gemini-1.5-flash-{001,002}-FC | Function Calling|
+|gemini-1.5-flash-{001,002} | Prompt|
 |glaiveai/glaive-function-calling-v1 💻| Function Calling|
 |gpt-3.5-turbo-0125-FC| Function Calling|
 |gpt-3.5-turbo-0125| Prompt|
diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py
index 6c16a01e0..f82b9ded4 100644
--- a/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py
+++ b/berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py
@@ -263,27 +263,63 @@
         "Fireworks",
         "Apache 2.0",
     ],
-    "gemini-1.5-pro-preview-0514-FC": [
-        "Gemini-1.5-Pro-Preview-0514 (FC)",
+    "gemini-1.5-pro-002": [
+        "Gemini-1.5-Pro-002 (Prompt)",
         "https://deepmind.google/technologies/gemini/pro/",
         "Google",
         "Proprietary",
     ],
-    "gemini-1.5-flash-preview-0514-FC": [
-        "Gemini-1.5-Flash-Preview-0514 (FC)",
+    "gemini-1.5-pro-002-FC": [
+        "Gemini-1.5-Pro-002 (FC)",
+        "https://deepmind.google/technologies/gemini/pro/",
+        "Google",
+        "Proprietary",
+    ],
+    "gemini-1.5-pro-001": [
+        "Gemini-1.5-Pro-001 (Prompt)",
+        "https://deepmind.google/technologies/gemini/pro/",
+        "Google",
+        "Proprietary",
+    ],
+    "gemini-1.5-pro-001-FC": [
+        "Gemini-1.5-Pro-001 (FC)",
+        "https://deepmind.google/technologies/gemini/pro/",
+        "Google",
+        "Proprietary",
+    ],
+    "gemini-1.5-flash-002": [
+        "Gemini-1.5-Flash-002 (Prompt)",
+        "https://deepmind.google/technologies/gemini/flash/",
+        "Google",
+        "Proprietary",
+    ],
"gemini-1.5-flash-002-FC": [ + "Gemini-1.5-Flash-002 (FC)", "https://deepmind.google/technologies/gemini/flash/", "Google", "Proprietary", ], - "gemini-1.5-pro-preview-0409-FC": [ - "Gemini-1.5-Pro-Preview-0409 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", + "gemini-1.5-flash-001": [ + "Gemini-1.5-Flash-001 (Prompt)", + "https://deepmind.google/technologies/gemini/flash/", "Google", "Proprietary", ], - "gemini-1.0-pro-FC": [ - "Gemini-1.0-Pro-001 (FC)", - "https://deepmind.google/technologies/gemini/#introduction", + "gemini-1.5-flash-001-FC": [ + "Gemini-1.5-Flash-001 (FC)", + "https://deepmind.google/technologies/gemini/flash/", + "Google", + "Proprietary", + ], + "gemini-1.0-pro-002": [ + "Gemini-1.0-Pro-002 (Prompt)", + "https://deepmind.google/technologies/gemini/pro/", + "Google", + "Proprietary", + ], + "gemini-1.0-pro-002-FC": [ + "Gemini-1.0-Pro-002 (FC)", + "https://deepmind.google/technologies/gemini/pro/", "Google", "Proprietary", ], @@ -539,10 +575,16 @@ "gpt-4-0613-FC": 30, "gpt-3.5-turbo-0125": 0.5, "gpt-3.5-turbo-0125-FC": 0.5, - "gemini-1.0-pro-FC": 0.5, - "gemini-1.5-pro-preview-0409-FC": 3.5, - "gemini-1.5-pro-preview-0514-FC": 3.5, - "gemini-1.5-flash-preview-0514-FC": 0.35, + "gemini-1.5-pro-002": 1.25, + "gemini-1.5-pro-002-FC": 1.25, + "gemini-1.5-pro-001": 1.25, + "gemini-1.5-pro-001-FC": 1.25, + "gemini-1.5-flash-002": 0.075 , + "gemini-1.5-flash-002-FC": 0.075 , + "gemini-1.5-flash-001": 0.075 , + "gemini-1.5-flash-001-FC": 0.075 , + "gemini-1.0-pro-002": 0.5, + "gemini-1.0-pro-002-FC": 0.5, "databricks-dbrx-instruct": 2.25, "command-r-plus-FC": 3, "command-r-plus": 3, @@ -591,10 +633,16 @@ "gpt-4-0613-FC": 60, "gpt-3.5-turbo-0125": 1.5, "gpt-3.5-turbo-0125-FC": 1.5, - "gemini-1.0-pro-FC": 1.5, - "gemini-1.5-pro-preview-0409-FC": 10.50, - "gemini-1.5-pro-preview-0514-FC": 10.50, - "gemini-1.5-flash-preview-0514-FC": 0.53, + "gemini-1.5-pro-002": 5, + "gemini-1.5-pro-002-FC": 5, + "gemini-1.5-pro-001": 5, + "gemini-1.5-pro-001-FC": 5, + "gemini-1.5-flash-002": 0.30, + "gemini-1.5-flash-002-FC": 0.30, + "gemini-1.5-flash-001": 0.30, + "gemini-1.5-flash-001-FC": 0.30, + "gemini-1.0-pro-002": 1.5, + "gemini-1.0-pro-002-FC": 1.5, "databricks-dbrx-instruct": 6.75, "command-r-plus-FC": 15, "command-r-plus": 15, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py index f70e6da8e..5b25b8eac 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/constant.py @@ -128,10 +128,11 @@ "mistral-large-2407-FC", "mistral-small-2402-FC", "mistral-small-2402-FC", - "gemini-1.0-pro-FC", - "gemini-1.5-pro-preview-0409-FC", - "gemini-1.5-pro-preview-0514-FC", - "gemini-1.5-flash-preview-0514-FC", + "gemini-1.5-pro-002-FC", + "gemini-1.5-pro-001-FC", + "gemini-1.5-flash-002-FC", + "gemini-1.5-flash-001-FC", + "gemini-1.0-pro-002-FC", "meetkai/functionary-small-v3.1-FC", "meetkai/functionary-small-v3.2-FC", "meetkai/functionary-medium-v3.1-FC", diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py index 55de93e04..236716e02 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py @@ -56,9 +56,16 @@ "firefunction-v1-FC": FireworksHandler, "firefunction-v2-FC": FireworksHandler, 
"Nexusflow-Raven-v2": NexusHandler, - "gemini-1.0-pro-FC": GeminiHandler, - "gemini-1.5-pro-preview-0514-FC": GeminiHandler, - "gemini-1.5-flash-preview-0514-FC": GeminiHandler, + "gemini-1.5-pro-002": GeminiHandler, + "gemini-1.5-pro-002-FC": GeminiHandler, + "gemini-1.5-pro-001": GeminiHandler, + "gemini-1.5-pro-001-FC": GeminiHandler, + "gemini-1.5-flash-002": GeminiHandler, + "gemini-1.5-flash-002-FC": GeminiHandler, + "gemini-1.5-flash-001": GeminiHandler, + "gemini-1.5-flash-001-FC": GeminiHandler, + "gemini-1.0-pro-002": GeminiHandler, + "gemini-1.0-pro-002-FC": GeminiHandler, "meetkai/functionary-small-v3.2-FC": FunctionaryHandler, "meetkai/functionary-medium-v3.1-FC": FunctionaryHandler, "databricks-dbrx-instruct": DatabricksHandler, @@ -105,7 +112,8 @@ # "gpt-4-0613": OpenAIHandler, # "claude-2.1": ClaudeHandler, # "claude-instant-1.2": ClaudeHandler, - # "gemini-1.5-pro-preview-0409-FC": GeminiHandler, + # "gemini-1.0-pro-001": GeminiHandler, + # "gemini-1.0-pro-001-FC": GeminiHandler, # "meetkai/functionary-small-v3.1-FC": FunctionaryHandler, # "mistral-tiny-2312": MistralHandler, # "glaiveai/glaive-function-calling-v1": GlaiveHandler, diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py index 8bcba3545..150d0f887 100644 --- a/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py +++ b/berkeley-function-call-leaderboard/bfcl/model_handler/proprietary_model/gemini.py @@ -7,14 +7,16 @@ from bfcl.model_handler.model_style import ModelStyle from bfcl.model_handler.utils import ( convert_to_tool, + default_decode_ast_prompting, + default_decode_execute_prompting, extract_system_prompt, format_execution_results_prompting, func_doc_language_specific_pre_processing, system_prompt_pre_processing_chat_model, ) -from google.protobuf.struct_pb2 import ( - ListValue, # This import should eventually be removed. See comment in the `decode_execute` method below -) + +# This import from struct_pb2 should eventually be removed. See comment in the `_handle_struct_values` and `_handle_list_values` method below +from google.protobuf.struct_pb2 import ListValue, Struct from vertexai.generative_models import ( Content, FunctionDeclaration, @@ -49,69 +51,149 @@ def _substitute_prompt_role(prompts: list[dict]) -> list[dict]: return prompts - def decode_ast(self, result, language="Python"): - if type(result) is not list: - result = [result] - decoded_output = [] - for invoked_function in result: - name = list(invoked_function.keys())[0] - params = json.loads(invoked_function[name]) - decoded_output.append({name: params}) - return decoded_output - - def decode_execute(self, result): - func_call_list = [] - for function_call in result: - for func_name, func_args in function_call.items(): - # Note: Below is a workaround for a bug in the Vertex AI library - # Accoding to the Vertex AI documentation https://ai.google.dev/gemini-api/docs/function-calling/tutorial?lang=python, cited below: - """ - # Set the model up with tools. - house_fns = [power_disco_ball, start_music, dim_lights] - - model = genai.GenerativeModel(model_name="gemini-1.5-flash", tools=house_fns) + def _handle_struct_values(self, input): + # Note: Below is a workaround for a bug in the Vertex AI library + # Accoding to the Vertex AI documentation https://ai.google.dev/gemini-api/docs/function-calling/tutorial?lang=python, cited below: + """ + # Set the model up with tools. 
+        house_fns = [power_disco_ball, start_music, dim_lights]
+
+        model = genai.GenerativeModel(model_name="gemini-1.5-flash", tools=house_fns)
+
+        # Call the API.
+        chat = model.start_chat()
+        response = chat.send_message("Turn this place into a party!")
+
+        # Print out each of the function calls requested from this single call.
+        for part in response.parts:
+            if fn := part.function_call:
+                args = ", ".join(f"{key}={val}" for key, val in fn.args.items())
+                print(f"{fn.name}({args})")
+        """
+        # ", ".join(f"{key}={val}" for key, val in fn.args.items()) should get the function call arguments in a ready-to-execute format
+        # However, the above code snippet will not work as expected when `val` is a ListValue object, and it would further cause a JSON serialization error when writing the result to a file
+        """
+        # This is a typical ListValue object that is causing the issue. It is a list of 4 string values
+        values {
+            string_value: "driver"
+        }
+        values {
+            string_value: "passenger"
+        }
+        values {
+            string_value: "rear_left"
+        }
+        values {
+            string_value: "rear_right"
+        }
+        """
+        # To fix this, we need to unpack the ListValue object into a list of string values before joining them
+        # So the above example gets converted to:
+        """
+        ["driver", "passenger", "rear_left", "rear_right"]
+        """
+        # This is a temporary fix until the bug in the Vertex AI library is fixed
+        # Convert it to a dictionary for easier manipulation
+        input = {k: v for k, v in input.items()}
+        for k, v in input.items():
+            if type(v) == ListValue:
+                input[k] = self._handle_list_values(v)
+            elif type(v) == Struct:
+                input[k] = self._handle_struct_values(v)
+        return input
+
+    def _handle_list_values(self, input):
+        """
+        @typing.final
+        class Value(google.protobuf.message.Message):
+            "`Value` represents a dynamically typed value which can be either
+            null, a number, a string, a boolean, a recursive struct value, or a
+            list of values. A producer of value is expected to set one of these
+            variants. Absence of any variant indicates an error.
+
+            The JSON representation for `Value` is JSON value.
+            "
+
+            DESCRIPTOR: google.protobuf.descriptor.Descriptor
+
+            NULL_VALUE_FIELD_NUMBER: builtins.int
+            NUMBER_VALUE_FIELD_NUMBER: builtins.int
+            STRING_VALUE_FIELD_NUMBER: builtins.int
+            BOOL_VALUE_FIELD_NUMBER: builtins.int
+            STRUCT_VALUE_FIELD_NUMBER: builtins.int
+            LIST_VALUE_FIELD_NUMBER: builtins.int
+            null_value: global___NullValue.ValueType
+            "Represents a null value."
+            number_value: builtins.float
+            "Represents a double value."
+            string_value: builtins.str
+            "Represents a string value."
+            bool_value: builtins.bool
+            "Represents a boolean value."
+            @property
+            def struct_value(self) -> global___Struct:
+                "Represents a structured value."
+
+            @property
+            def list_value(self) -> global___ListValue:
+                "Represents a repeated `Value`."
+
+            def __init__(
+                self,
+                *,
+                null_value: global___NullValue.ValueType | None = ...,
+                number_value: builtins.float | None = ...,
+                string_value: builtins.str | None = ...,
+                bool_value: builtins.bool | None = ...,
+                struct_value: global___Struct | None = ...,
+                list_value: global___ListValue | None = ...,
+            ) -> None: ...
+            def HasField(self, field_name: typing.Literal["bool_value", b"bool_value", "kind", b"kind", "list_value", b"list_value", "null_value", b"null_value", "number_value", b"number_value", "string_value", b"string_value", "struct_value", b"struct_value"]) -> builtins.bool: ...
+            def ClearField(self, field_name: typing.Literal["bool_value", b"bool_value", "kind", b"kind", "list_value", b"list_value", "null_value", b"null_value", "number_value", b"number_value", "string_value", b"string_value", "struct_value", b"struct_value"]) -> None: ...
+            def WhichOneof(self, oneof_group: typing.Literal["kind", b"kind"]) -> typing.Literal["null_value", "number_value", "string_value", "bool_value", "struct_value", "list_value"] | None: ...
+        """
+        parsed_list_result = []
+        for item in input.values:
+            field_name = item.WhichOneof("kind")
+            if field_name == "null_value":
+                value = item.null_value
+            elif field_name == "number_value":
+                value = item.number_value
+            elif field_name == "string_value":
+                value = item.string_value
+            elif field_name == "bool_value":
+                value = item.bool_value
+            elif field_name == "struct_value":
+                value = self._handle_struct_values(item.struct_value)
+            elif field_name == "list_value":
+                value = self._handle_list_values(item.list_value)
+            else:
+                value = None
+            parsed_list_result.append(value)
-                # Call the API.
-                chat = model.start_chat()
-                response = chat.send_message("Turn this place into a party!")
+
+        return parsed_list_result
-                # Print out each of the function calls requested from this single call.
-                for part in response.parts:
-                    if fn := part.function_call:
-                        args = ", ".join(f"{key}={val}" for key, val in fn.args.items())
-                        print(f"{fn.name}({args})")
-                """
-                # ", ".join(f"{key}={val}" for key, val in fn.args.items()) should get the function call arguments in a ready-to-execute format
-                # However, the above code snippet will not work as expected when `val` is a ListValue object, and it would further cause the json serialization error when writing the result to a file
-                """
-                # This is a typical ListValue object that is causing the issue. It is a list of 4 string values
-                values {
-                    string_value: "driver"
-                }
-                values {
-                    string_value: "passenger"
-                }
-                values {
-                    string_value: "rear_left"
-                }
-                values {
-                    string_value: "rear_right"
-                }
-                """
-                # To fix this, we need to unpack the ListValue object to a list of string values before joining them
-                # So the above example gets converted to:
-                """
-                ["driver", "passenger", "rear_left", "rear_right"]
-                """
-                # This will be the temporary fix until the bug in the Vertex AI library is fixed
-                for k, v in func_args.items():
-                    if type(v) == ListValue:
-                        func_args[k] = [item.string_value for item in v.values]
+    def decode_ast(self, result, language="Python"):
+        if "FC" not in self.model_name:
+            result = result.replace("```tool_code\n", "").replace("\n```", "")
+            return default_decode_ast_prompting(result, language)
+        else:
+            if type(result) is not list:
+                result = [result]
+            return result
-                func_call_list.append(
-                    f"{func_name}({','.join([f'{k}={repr(v)}' for k, v in func_args.items()])})"
-                )
-        return func_call_list
+    def decode_execute(self, result):
+        if "FC" not in self.model_name:
+            result = result.replace("```tool_code\n", "").replace("\n```", "")
+            return default_decode_execute_prompting(result)
+        else:
+            func_call_list = []
+            for function_call in result:
+                for func_name, func_args in function_call.items():
+                    func_call_list.append(
+                        f"{func_name}({','.join([f'{k}={repr(v)}' for k, v in func_args.items()])})"
+                    )
+            return func_call_list
 
     #### FC methods ####
 
@@ -197,7 +279,9 @@ def _parse_query_response_FC(self, api_response: any) -> dict:
             if part.function_call and part.function_call.name:
                 part_func_name = part.function_call.name
                 part_func_args = part.function_call.args
-                part_func_args_dict = {k: v for k, v in part_func_args.items()}
+                # Bug patch for the Vertex AI library
+                part_func_args_dict = self._handle_struct_values(part_func_args)
+
                 fc_parts.append({part_func_name: part_func_args_dict})
                 tool_call_func_names.append(part_func_name)
             else:
@@ -230,7 +314,6 @@ def text(self) -> str:
             "output_token": api_response.usage_metadata.candidates_token_count,
         }
 
-    # TODO: Is it better to do it in query method?
     def add_first_turn_message_FC(
         self, inference_data: dict, first_turn_message: list[dict]
     ) -> dict:
@@ -342,8 +425,13 @@ def _parse_query_response_prompting(self, api_response: any) -> dict:
         """TypeError: argument of type 'Part' is not iterable"""
         # So again, we need to directly access the `api_response.candidates[0].content.parts[0]._raw_part.text` attribute to get the text content of the part
         # This is a workaround for this bug, until the bug is fixed
+
+        if len(api_response.candidates[0].content.parts) > 0:
+            model_responses = api_response.candidates[0].content.parts[0]._raw_part.text
+        else:
+            model_responses = "The model did not return any response."
         return {
-            "model_responses": api_response.candidates[0].content.parts[0]._raw_part.text,
+            "model_responses": model_responses,
             "input_token": api_response.usage_metadata.prompt_token_count,
             "output_token": api_response.usage_metadata.candidates_token_count,
         }
@@ -386,8 +474,11 @@ def _add_execution_results_prompting(
         formatted_results_message = format_execution_results_prompting(
             inference_data, execution_results, model_response_data
         )
-        inference_data["message"].append(
-            {"role": "user", "content": formatted_results_message}
+        tool_message = Content(
+            role="user",
+            parts=[
+                Part.from_text(formatted_results_message),
+            ],
         )
-
+        inference_data["message"].append(tool_message)
         return inference_data
diff --git a/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py
index fd8c61acc..abcce0129 100644
--- a/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py
+++ b/berkeley-function-call-leaderboard/bfcl/model_handler/utils.py
@@ -82,12 +82,15 @@ def convert_to_tool(functions, mapping, model_style):
 
         if model_style == ModelStyle.Google:
             # Remove fields that are not supported by Gemini.
+            # No `optional` field in the function schema.
+            if "optional" in item["parameters"]:
+                del item["parameters"]["optional"]
             for params in item["parameters"]["properties"].values():
                 # No `default` field in Google's schema.
                 if "default" in params:
                     params["description"] += f" Default is: {str(params['default'])}."
                     del params["default"]
-                # No `optional` field.
+                # No `optional` field in the parameter schema either.
                 if "optional" in params:
                     params["description"] += f" Optional: {str(params['optional'])}."
                     del params["optional"]
@@ -95,12 +98,24 @@ def convert_to_tool(functions, mapping, model_style):
                 if "maximum" in params:
                     params["description"] += f" Maximum value: {str(params['maximum'])}."
                     del params["maximum"]
+                # No `minItems` field.
+                if "minItems" in params:
+                    params["description"] += f" Minimum number of items: {str(params['minItems'])}."
+                    del params["minItems"]
+                # No `maxItems` field.
+                if "maxItems" in params:
+                    params["description"] += f" Maximum number of items: {str(params['maxItems'])}."
+                    del params["maxItems"]
                 # No `additionalProperties` field.
                 if "additionalProperties" in params:
                     params[
                         "description"
                     ] += f" Additional properties: {str(params['additionalProperties'])}."
                     del params["additionalProperties"]
+                # The `enum` field is only supported when the type is `string`.
+                if "enum" in params and params["type"] != "string":
+                    params["description"] += f" Enum values: {str(params['enum'])}."
+ del params["enum"] if model_style == ModelStyle.COHERE: if os.getenv("USE_COHERE_OPTIMIZATION") == "True": @@ -723,26 +738,26 @@ def format_execution_results_prompting( def default_decode_ast_prompting(result, language="Python"): - func = result - if " " == func[0]: - func = func[1:] - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decoded_output = ast_parse(func, language) + result = result.strip() + result = result.rstrip("\n") + result = result.lstrip('\n') + if not result.startswith("["): + result = "[" + result + if not result.endswith("]"): + result = result + "]" + decoded_output = ast_parse(result, language) return decoded_output def default_decode_execute_prompting(result): - func = result - if " " == func[0]: - func = func[1:] - if not func.startswith("["): - func = "[" + func - if not func.endswith("]"): - func = func + "]" - decoded_output = ast_parse(func) + result = result.strip() + result = result.rstrip("\n") + result = result.lstrip('\n') + if not result.startswith("["): + result = "[" + result + if not result.endswith("]"): + result = result + "]" + decoded_output = ast_parse(result) return decoded_output_to_execution_list(decoded_output)