From db53655a987760e407a84fd79fa7361a4d2531e7 Mon Sep 17 00:00:00 2001
From: "Huanzhi (Hans) Mao"
Date: Fri, 9 Aug 2024 21:09:16 -0700
Subject: [PATCH] [BFCL] Set Model Temperature to 0.001 for All Models (#574)

The model response generation script currently uses a default temperature of
0.7 for inference. This introduces randomness into the generated outputs,
leading to run-to-run variability in the evaluation scores. For benchmarking,
we set the temperature to 0.001 so that the evaluation results are consistent
and reproducible.

resolves #500, resolves #562

This change will affect the leaderboard scores; we will update them shortly.

---------

Co-authored-by: Shishir Patil <30296397+ShishirPatil@users.noreply.github.com>
---
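[Reviewer note, not part of the commit] For intuition on why 0.001 behaves
near-deterministically: sampling divides the logits by the temperature before
the softmax, so a near-zero temperature concentrates essentially all
probability mass on the argmax token. A minimal, self-contained sketch
(illustrative only, not BFCL code):

    # How temperature rescales next-token probabilities during sampling.
    import numpy as np

    def token_probs(logits, temperature):
        scaled = np.array(logits, dtype=float) / temperature
        scaled -= scaled.max()  # subtract max for numerical stability
        exp = np.exp(scaled)
        return exp / exp.sum()

    logits = [2.0, 1.5, 0.5]
    print(token_probs(logits, 0.7))    # mass spread across tokens -> outputs vary run to run
    print(token_probs(logits, 0.001))  # ~[1.0, 0.0, 0.0] -> effectively greedy decoding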
 berkeley-function-call-leaderboard/README.md                   |  1 +
 .../model_handler/arctic_handler.py                            |  3 ++-
 .../model_handler/claude_fc_handler.py                         |  2 +-
 .../model_handler/claude_prompt_handler.py                     |  2 +-
 .../model_handler/cohere_handler.py                            |  2 +-
 .../model_handler/databricks_handler.py                        |  7 ++-----
 .../model_handler/deepseek_handler.py                          |  2 +-
 .../model_handler/firework_ai_handler.py                       | 15 +--------------
 .../model_handler/functionary_handler.py                       |  7 ++-----
 .../model_handler/gemini_handler.py                            |  2 +-
 .../model_handler/gemma_handler.py                             |  2 +-
 .../model_handler/glaive_handler.py                            |  2 +-
 .../model_handler/glm_handler.py                               |  2 +-
 .../model_handler/gorilla_handler.py                           |  2 +-
 .../model_handler/gpt_handler.py                               |  2 +-
 .../model_handler/granite_handler.py                           |  3 +--
 .../model_handler/handler.py                                   |  2 +-
 .../model_handler/hermes_handler.py                            |  2 +-
 .../model_handler/llama_handler.py                             |  2 +-
 .../model_handler/mistral_handler.py                           |  2 +-
 .../model_handler/nexus_handler.py                             |  1 -
 .../model_handler/nvidia_handler.py                            |  8 +++-----
 .../model_handler/oss_handler.py                               |  2 +-
 .../model_handler/xlam_handler.py                              |  1 -
 .../model_handler/yi_handler.py                                |  2 +-
 berkeley-function-call-leaderboard/openfunctions_evaluation.py |  2 +-
 26 files changed, 29 insertions(+), 51 deletions(-)

diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index efaed6f5b4..4a056dec97 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -220,6 +220,7 @@ Some companies have proposed some optimization strategies in their models' handl
 
 ## Changelog
 
+* [August 8, 2024] [#574](https://github.com/ShishirPatil/gorilla/pull/574): Set temperature to 0.001 for all models for consistency and reproducibility.
 * [August 5, 2024] [#568](https://github.com/ShishirPatil/gorilla/pull/568): Rephrase the question prompt for the `executable_parallel_function` category to remove potentially misleading information implying multi-turn function calls.
 * [August 4, 2024] [#557](https://github.com/ShishirPatil/gorilla/pull/557): Bug fix in the possible answers.
   * simple: 7 affected
diff --git a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py b/berkeley-function-call-leaderboard/model_handler/arctic_handler.py
index fdfd9d2196..d0d9852636 100644
--- a/berkeley-function-call-leaderboard/model_handler/arctic_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/arctic_handler.py
@@ -2,8 +2,9 @@
 from model_handler.utils import ast_parse
 
 class ArcticHandler(NvidiaHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
+
     def decode_ast(self, result, language="Python"):
         result = result.replace("\n", "")
         if not result.startswith("["):
diff --git a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py b/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py
index 3b575d80e7..1e5fc66c66 100644
--- a/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py
@@ -18,7 +18,7 @@
 
 class ClaudeFCHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.Anthropic_Prompt
diff --git a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py b/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py
index ad5c180c29..5b641c4161 100644
--- a/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py
@@ -19,7 +19,7 @@
 
 class ClaudePromptingHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.Anthropic_Prompt
diff --git a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py b/berkeley-function-call-leaderboard/model_handler/cohere_handler.py
index b573164bfa..cb51dfb1f4 100644
--- a/berkeley-function-call-leaderboard/model_handler/cohere_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/cohere_handler.py
@@ -23,7 +23,7 @@
 
 class CohereHandler(BaseHandler):
     client: cohere.Client
 
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.COHERE
diff --git a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py b/berkeley-function-call-leaderboard/model_handler/databricks_handler.py
index 53474ca911..9921ca6afe 100644
--- a/berkeley-function-call-leaderboard/model_handler/databricks_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/databricks_handler.py
@@ -12,12 +12,9 @@
 
 class DatabricksHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
-        self.model_name = model_name
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
+        super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OpenAI
-        self.temperature = temperature
-        self.top_p = top_p
-        self.max_tokens = max_tokens
         # NOTE: To run the Databricks model, you need to provide your own Databricks API key and your own Azure endpoint URL.
         self.client = OpenAI(
diff --git a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py b/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py
index 236c232733..2522d6e53d 100644
--- a/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/deepseek_handler.py
@@ -4,7 +4,7 @@
 
 class DeepseekHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py b/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py
index ac2aa7062c..c24f2fda93 100644
--- a/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py
@@ -10,28 +10,15 @@
 
 class FireworkAIHandler(OpenAIHandler):
-    def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.FIREWORK_AI
-        self.temperature = 0.0
         self.client = OpenAI(
             base_url="https://api.fireworks.ai/inference/v1",
             api_key=os.getenv("FIRE_WORKS_API_KEY"),
         )
 
-    def write(self, result, file_to_open):
-        # This method is used to write the result to the file.
-        if not os.path.exists("./result"):
-            os.mkdir("./result")
-        if not os.path.exists(f"./result/{self.model_name}"):
-            os.mkdir(f"./result/{self.model_name}")
-        with open(
-            f"./result/{self.model_name}/"
-            + file_to_open.replace(".json", "_result.json"),
-            "a+",
-        ) as f:
-            f.write(json.dumps(result) + "\n")
-
     def inference(self, prompt, functions, test_category):
         functions = language_specific_pre_processing(functions, test_category)
diff --git a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py b/berkeley-function-call-leaderboard/model_handler/functionary_handler.py
index f7abbbe62b..0fa8e8375a 100644
--- a/berkeley-function-call-leaderboard/model_handler/functionary_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/functionary_handler.py
@@ -5,11 +5,8 @@
 
 # For setup instructions, please refer to https://github.com/MeetKai/functionary for setup details.
 class FunctionaryHandler(OpenAIHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
-        self.temperature = temperature
-        self.top_p = top_p
-        self.max_tokens = max_tokens
-        self.model_name = model_name
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
+        super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OpenAI
 
         self.client = OpenAI(base_url="http://localhost:8000/v1", api_key="functionary")
diff --git a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py b/berkeley-function-call-leaderboard/model_handler/gemini_handler.py
index 80763fee19..e4e6cc1967 100644
--- a/berkeley-function-call-leaderboard/model_handler/gemini_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/gemini_handler.py
@@ -11,7 +11,7 @@
 
 class GeminiHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.Google
diff --git a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py b/berkeley-function-call-leaderboard/model_handler/gemma_handler.py
index d315801e4e..d6e29af6b2 100644
--- a/berkeley-function-call-leaderboard/model_handler/gemma_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/gemma_handler.py
@@ -4,7 +4,7 @@
 
 class GemmaHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py b/berkeley-function-call-leaderboard/model_handler/glaive_handler.py
index 1b68bf7669..cc93bcc2fe 100644
--- a/berkeley-function-call-leaderboard/model_handler/glaive_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/glaive_handler.py
@@ -4,7 +4,7 @@
 
 class GlaiveHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/glm_handler.py b/berkeley-function-call-leaderboard/model_handler/glm_handler.py
index c0475ce295..c9d79ecb1a 100644
--- a/berkeley-function-call-leaderboard/model_handler/glm_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/glm_handler.py
@@ -16,7 +16,7 @@
 
 class GLMHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.max_model_len = 4096
         self.stop_token_ids = [151329, 151336, 151338]
diff --git a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py b/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py
index 91a640c0a5..f3f64cfb91 100644
--- a/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/gorilla_handler.py
@@ -9,7 +9,7 @@
 
 class GorillaHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.Gorilla
diff --git a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py b/berkeley-function-call-leaderboard/model_handler/gpt_handler.py
index 6449c948a2..105c87150c 100644
--- a/berkeley-function-call-leaderboard/model_handler/gpt_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/gpt_handler.py
@@ -18,7 +18,7 @@
 
 class OpenAIHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OpenAI
         self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
diff --git a/berkeley-function-call-leaderboard/model_handler/granite_handler.py b/berkeley-function-call-leaderboard/model_handler/granite_handler.py
index b81780052a..21499f8288 100644
--- a/berkeley-function-call-leaderboard/model_handler/granite_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/granite_handler.py
@@ -11,8 +11,7 @@
 
 class GraniteHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
-        temperature = 0.001
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/handler.py b/berkeley-function-call-leaderboard/model_handler/handler.py
index a4af8cd2ff..0f6832476d 100644
--- a/berkeley-function-call-leaderboard/model_handler/handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/handler.py
@@ -6,7 +6,7 @@ class BaseHandler:
     model_name: str
     model_style: ModelStyle
 
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         self.model_name = model_name
         self.temperature = temperature
         self.top_p = top_p
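[Reviewer note, not part of the commit] With the 0.001 default now living on
BaseHandler.__init__ (which stores model_name, temperature, top_p, and
max_tokens), a handler no longer needs to re-assign those attributes itself;
it just forwards its arguments. A hypothetical subclass for illustration (the
class name and model style are made up):

    from model_handler.handler import BaseHandler
    from model_handler.model_style import ModelStyle

    class MyModelHandler(BaseHandler):
        def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
            # BaseHandler stores the sampling parameters; no per-field copies needed.
            super().__init__(model_name, temperature, top_p, max_tokens)
            self.model_style = ModelStyle.OpenAI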
diff --git a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py b/berkeley-function-call-leaderboard/model_handler/hermes_handler.py
index df00afa383..56bbd6a489 100644
--- a/berkeley-function-call-leaderboard/model_handler/hermes_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/hermes_handler.py
@@ -6,7 +6,7 @@
 
 class HermesHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/llama_handler.py b/berkeley-function-call-leaderboard/model_handler/llama_handler.py
index 06abac9b60..a5e1c14fa4 100644
--- a/berkeley-function-call-leaderboard/model_handler/llama_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/llama_handler.py
@@ -7,7 +7,7 @@
 
 class LlamaHandler(OSSHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
 
     def _format_prompt(prompt, function, test_category):
diff --git a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py b/berkeley-function-call-leaderboard/model_handler/mistral_handler.py
index f6ce5bfb6e..cf92d3bb74 100644
--- a/berkeley-function-call-leaderboard/model_handler/mistral_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/mistral_handler.py
@@ -18,7 +18,7 @@
 
 class MistralHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.Mistral
diff --git a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py b/berkeley-function-call-leaderboard/model_handler/nexus_handler.py
index 18b538c233..483b1b9d1d 100644
--- a/berkeley-function-call-leaderboard/model_handler/nexus_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/nexus_handler.py
@@ -10,7 +10,6 @@
 class NexusHandler(BaseHandler):
     def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
-        temperature = 0.001
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.NEXUS
diff --git a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py b/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py
index 80884de173..e0d8077da3 100644
--- a/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/nvidia_handler.py
@@ -13,16 +13,14 @@
 )
 
 class NvidiaHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
-        self.model_name = model_name
-        self.temperature = temperature
-        self.top_p = top_p
-        self.max_tokens = max_tokens
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
+        super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OpenAI
         self.client = OpenAI(
             base_url = "https://integrate.api.nvidia.com/v1",
             api_key = os.getenv("NVIDIA_API_KEY")
         )
+
     def inference(self, prompt, functions, test_category):
         prompt = augment_prompt_by_languge(prompt,test_category)
         functions = language_specific_pre_processing(functions,test_category)
diff --git a/berkeley-function-call-leaderboard/model_handler/oss_handler.py b/berkeley-function-call-leaderboard/model_handler/oss_handler.py
index 1859b32511..1fd3108f18 100644
--- a/berkeley-function-call-leaderboard/model_handler/oss_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/oss_handler.py
@@ -11,7 +11,7 @@
 
 class OSSHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000, dtype="float16") -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000, dtype="float16") -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OSSMODEL
         self.dtype = dtype
diff --git a/berkeley-function-call-leaderboard/model_handler/xlam_handler.py b/berkeley-function-call-leaderboard/model_handler/xlam_handler.py
index 9d0ab1e9fc..fba90cb5d5 100644
--- a/berkeley-function-call-leaderboard/model_handler/xlam_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/xlam_handler.py
@@ -31,7 +31,6 @@
 
 class xLAMHandler(OSSHandler):
     def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=512, dtype="bfloat16") -> None:
-        temperature = 0.001
         super().__init__(model_name, temperature, top_p, max_tokens, dtype)
         self.model_style = ModelStyle.OSSMODEL
diff --git a/berkeley-function-call-leaderboard/model_handler/yi_handler.py b/berkeley-function-call-leaderboard/model_handler/yi_handler.py
index 8e8967cf6f..d5d1647a47 100644
--- a/berkeley-function-call-leaderboard/model_handler/yi_handler.py
+++ b/berkeley-function-call-leaderboard/model_handler/yi_handler.py
@@ -12,7 +12,7 @@
 
 class YiHandler(BaseHandler):
-    def __init__(self, model_name, temperature=0.0, top_p=1, max_tokens=1000) -> None:
+    def __init__(self, model_name, temperature=0.001, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
         self.model_style = ModelStyle.OpenAI
         self.base_url = "https://api.01.ai/v1"
diff --git a/berkeley-function-call-leaderboard/openfunctions_evaluation.py b/berkeley-function-call-leaderboard/openfunctions_evaluation.py
index e3a08698cf..4e08a4a307 100644
--- a/berkeley-function-call-leaderboard/openfunctions_evaluation.py
+++ b/berkeley-function-call-leaderboard/openfunctions_evaluation.py
@@ -14,7 +14,7 @@ def get_args():
     parser.add_argument("--test-category", type=str, default="all", nargs="+")
 
     # Parameters for the model that you want to test.
-    parser.add_argument("--temperature", type=float, default=0.7)
+    parser.add_argument("--temperature", type=float, default=0.001)
     parser.add_argument("--top-p", type=float, default=1)
     parser.add_argument("--max-tokens", type=int, default=1200)
     parser.add_argument("--num-gpus", default=1, type=int)
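[Usage note, not part of the commit] After this change, a benchmark run picks
up temperature 0.001 automatically, and sampling diversity remains available
by passing the flag explicitly. Assuming the README's generation command with
its --model flag (MODEL_NAME is a placeholder):

    # Uses the new default of 0.001:
    python openfunctions_evaluation.py --model MODEL_NAME --test-category all

    # Opt back into sampling diversity (scores may then vary between runs):
    python openfunctions_evaluation.py --model MODEL_NAME --test-category all --temperature 0.7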