diff --git a/berkeley-function-call-leaderboard/README.md b/berkeley-function-call-leaderboard/README.md
index e5667577f..e3ee9df07 100644
--- a/berkeley-function-call-leaderboard/README.md
+++ b/berkeley-function-call-leaderboard/README.md
@@ -230,6 +230,7 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure
 
 ## Changelog
 
+* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$
 * [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`.
 * [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377):
   - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs.
diff --git a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py b/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py
index aeab83076..ba8a76ca1 100644
--- a/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py
+++ b/berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py
@@ -284,13 +284,13 @@
         "Google",
         "Proprietary",
     ],
-    "meta-llama_Meta-Llama-3-8B-Instruct":[
+    "meta-llama_Meta-Llama-3-8B-Instruct": [
         "Meta-Llama-3-8B-Instruct (Prompt)",
         "https://llama.meta.com/llama3",
         "Meta",
         "Meta Llama 3 Community",
     ],
-    "meta-llama_Meta-Llama-3-70B-Instruct":[
+    "meta-llama_Meta-Llama-3-70B-Instruct": [
         "Meta-Llama-3-70B-Instruct (Prompt)",
         "https://llama.meta.com/llama3",
         "Meta",
@@ -336,6 +336,7 @@
     "mistral-medium-2312": 2.7,
     "mistral-small-2402-FC-Any": 2,
     "mistral-small-2402-FC-Auto": 2,
+    "mistral-small-2402": 2,
     "mistral-tiny-2312": 0.25,
     "gpt-4-1106-preview-FC": 10,
     "gpt-4-1106-preview": 10,
@@ -348,14 +349,12 @@
     "gpt-3.5-turbo-0125": 1.5,
     "gpt-3.5-turbo-0125-FC": 1.5,
     "gemini-1.0-pro": 1,
-    "gemini-1.5-pro": 7,
+    "gemini-1.5-pro-preview-0409": 7,
     "databricks-dbrx-instruct": 2.25,
     "command-r-plus-FC": 3,
     "command-r-plus": 3,
     "command-r-plus-FC-optimized": 3,
     "command-r-plus-optimized": 3,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 0.15,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 1,
 }
 
 OUTPUT_PRICE_PER_MILLION_TOKEN = {
@@ -369,6 +368,7 @@
     "claude-instant-1.2": 5.51,
     "mistral-large-2402-FC-Any": 24,
     "mistral-large-2402-FC-Auto": 24,
+    "mistral-small-2402": 24,
     "mistral-medium-2312": 8.1,
     "mistral-small-2402-FC-Any": 6,
     "mistral-small-2402-FC-Auto": 6,
@@ -384,50 +384,38 @@
     "gpt-3.5-turbo-0125": 2,
     "gpt-3.5-turbo-0125-FC": 2,
     "gemini-1.0-pro": 2,
-    "gemini-1.5-pro": 14,
+    "gemini-1.5-pro-preview-0409": 14,
     "databricks-dbrx-instruct": 6.75,
     "command-r-plus-FC": 15,
     "command-r-plus": 15,
     "command-r-plus-FC-optimized": 15,
     "command-r-plus-optimized": 15,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 0.15,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 1,
 }
 
 # The latency of the open-source models are hardcoded here.
 # Because we do batching when generating the data, so the latency is not accurate from the result data.
-# This is the latency for the whole batch of data.
+# This is the latency for the whole batch of data, when using 8 V100 GPUs.
 OSS_LATENCY = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 2040,
-    "google/gemma-7b-it": 161,
-    "glaiveai/glaive-function-calling-v1": 99,
-    "NousResearch/Hermes-2-Pro-Mistral-7B": 666,
+    "deepseek-ai/deepseek-coder-6.7b-instruct": 909,
+    "google/gemma-7b-it": 95,
+    "NousResearch/Hermes-2-Pro-Mistral-7B": 135,
     "meta-llama/Meta-Llama-3-8B-Instruct": 73,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 304,
-}
-
-OSS_INPUT_TOKEN = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 884190,
-    "google/gemma-7b-it": 733701,
-}
-
-OSS_OUTPUT_TOKEN = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 2009421,
-    "google/gemma-7b-it": 130206,
+    "meta-llama/Meta-Llama-3-70B-Instruct": 307,
+    "gorilla-openfunctions-v2": 83,
 }
 
 NO_COST_MODELS = [
     "Nexusflow-Raven-v2",
     "fire-function-v1-FC",
-    "meetkai_functionary-medium-v2.4-FC",
-    "meetkai_functionary-small-v2.2-FC",
-    "meetkai_functionary-small-v2.4-FC",
+    "meetkai/functionary-medium-v2.4-FC",
+    "meetkai/functionary-small-v2.2-FC",
+    "meetkai/functionary-small-v2.4-FC",
 ]
 
-A100_PRICE_PER_HOUR = (
-    10.879 / 8
-)  # Price got from AZure, 10.879 per hour for 8 A100, 3 years reserved
+# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price
+# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/
+V100_x8_PRICE_PER_HOUR = 22.032
 
 
 def extract_after_test(input_string):
@@ -720,7 +708,7 @@ def get_metric(model_name, cost_data, latency_data):
                 "N/A",
             )
             mean_latency = round(mean_latency, 2)
-            cost = mean_latency * 1000 * A100_PRICE_PER_HOUR / 3600
+            cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
             cost = round(cost, 2)
 
         elif len(latency_data["data"]) != 0:
@@ -732,7 +720,7 @@ def get_metric(model_name, cost_data, latency_data):
             percentile_95_latency = round(percentile_95_latency, 2)
 
             if model_name not in INPUT_PRICE_PER_MILLION_TOKEN:
-                cost = sum(latency_data["data"]) * A100_PRICE_PER_HOUR / 3600
+                cost = sum(latency_data["data"]) * V100_x8_PRICE_PER_HOUR / 3600
                 cost = round(cost, 2)
 
             if model_name in NO_COST_MODELS:
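For reference, here is a minimal sketch of the cost conversion this patch introduces in `get_metric()`: mean per-call latency (seconds) scaled to 1000 calls and priced at the hourly 8x V100 rate. The helper name and the sample latencies below are illustrative only and are not part of the patch.

```python
import statistics

# Azure Pay-As-You-Go price for one 8x V100 machine, matching the
# V100_x8_PRICE_PER_HOUR constant introduced in this patch.
V100_x8_PRICE_PER_HOUR = 22.032


def cost_per_1000_calls(per_call_latencies_s):
    """Hypothetical helper mirroring the mean-latency branch of get_metric():
    seconds for 1000 calls * (hourly 8x V100 price / 3600 seconds per hour)."""
    mean_latency = statistics.mean(per_call_latencies_s)
    return round(mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600, 2)


# Made-up per-call latencies of roughly 0.2 s each:
print(cost_per_1000_calls([0.18, 0.21, 0.20]))  # -> 1.2 (USD per 1000 calls)
```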