BFCL April 27th Release (Bug Fix in Cost/Latency Calculation) (ShishirPatil#390)

In this PR, we fix inconsistencies in the cost and latency
calculation for open-source models. Both metrics are now measured by
serving each model with [vLLM](https://github.com/vllm-project/vllm)
on 8 V100 GPUs, so results are consistent across models:

$$\text{Cost} = \text{Latency per 1000 function calls} \times \frac{\text{8xV100 Azure pay-as-you-go price per hour}}{3600}$$
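
For concreteness, here is a minimal Python sketch of the updated calculation. The constant `V100_x8_PRICE_PER_HOUR` and the arithmetic mirror the patch below; the helper function and the example latency are ours for illustration.

```python
# Minimal sketch of the updated cost formula (mirrors the patch below).

V100_x8_PRICE_PER_HOUR = 22.032  # Azure pay-as-you-go price for 8x V100, per hour

def cost_per_1000_calls(mean_latency_seconds: float) -> float:
    """Cost in USD of 1000 function calls at the measured mean latency."""
    # 1000 calls * (seconds per call) * (USD per hour / 3600 seconds per hour)
    return round(mean_latency_seconds * 1000 * V100_x8_PRICE_PER_HOUR / 3600, 2)

# Example: a model averaging 0.5 s per call costs about $3.06 per 1000 calls.
print(cost_per_1000_calls(0.5))
```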

This PR **DOES** change the leaderboard values in the `cost` and
`latency` columns, but it **DOES NOT** change the accuracy scores. We
will update the leaderboard in a separate PR, ShishirPatil#391.

We want to thank the community for pointing out this oversight. Thanks
to [@abacaj](https://twitter.com/abacaj) and
[@teknium1](https://twitter.com/Teknium1) for initially raising the
issue, and to [@natikgadzhi](https://twitter.com/natikgadzhi),
[@HamelHusain](https://twitter.com/HamelHusain),
[@nicoritschel](https://twitter.com/nicoritschel),
[@winglian](https://twitter.com/winglian),
[@olafgeibig](https://twitter.com/olafgeibig), and many others for
joining the conversation. We are listening to community feedback and
continuously improving the Berkeley Function Calling Leaderboard.
Discussions like
[this one](https://twitter.com/abacaj/status/1784003306508980250) are
great examples. Let us know what you want us to include next!

---------

Co-authored-by: Charlie Cheng-Jie Ji <[email protected]>
Co-authored-by: Fanjia Yan <[email protected]>
3 people authored Apr 27, 2024
1 parent 0d632da commit e90bebf
Showing 2 changed files with 21 additions and 32 deletions.
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/README.md
@@ -230,6 +230,7 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure
 
 ## Changelog
 
+* [April 27, 2024] [#390](https://github.com/ShishirPatil/gorilla/pull/390): Bug fix in cost and latency calculation for open-source models, which are now all calculated when serving the model with [vLLM](https://github.com/vllm-project/vllm) using 8 V100 GPUs for consistency. $$\text{Cost} = \text{Latency per 1000 function call} * (\text{8xV100 azure-pay-as-you-go-price per hour / 3600})$$
 * [April 25, 2024] [#386](https://github.com/ShishirPatil/gorilla/pull/386): Add 5 new models to the leaderboard: `meta-llama/Meta-Llama-3-8B-Instruct`, `meta-llama/Meta-Llama-3-70B-Instruct`, `gemini-1.5-pro-preview-0409`, `command-r-plus`, `command-r-plus-FC`.
 * [April 19, 2024] [#377](https://github.com/ShishirPatil/gorilla/pull/377):
   - Bug fix for the evaluation dataset in the executable test categories. This includes updates to both prompts and function docs.
52 changes: 20 additions & 32 deletions
@@ -284,13 +284,13 @@
         "Google",
         "Proprietary",
     ],
-    "meta-llama_Meta-Llama-3-8B-Instruct":[
+    "meta-llama_Meta-Llama-3-8B-Instruct": [
         "Meta-Llama-3-8B-Instruct (Prompt)",
         "https://llama.meta.com/llama3",
         "Meta",
         "Meta Llama 3 Community",
     ],
-    "meta-llama_Meta-Llama-3-70B-Instruct":[
+    "meta-llama_Meta-Llama-3-70B-Instruct": [
         "Meta-Llama-3-70B-Instruct (Prompt)",
         "https://llama.meta.com/llama3",
         "Meta",
@@ -336,6 +336,7 @@
     "mistral-medium-2312": 2.7,
     "mistral-small-2402-FC-Any": 2,
     "mistral-small-2402-FC-Auto": 2,
+    "mistral-small-2402": 2,
     "mistral-tiny-2312": 0.25,
     "gpt-4-1106-preview-FC": 10,
     "gpt-4-1106-preview": 10,
@@ -348,14 +349,12 @@
     "gpt-3.5-turbo-0125": 1.5,
     "gpt-3.5-turbo-0125-FC": 1.5,
     "gemini-1.0-pro": 1,
-    "gemini-1.5-pro": 7,
+    "gemini-1.5-pro-preview-0409": 7,
     "databricks-dbrx-instruct": 2.25,
     "command-r-plus-FC": 3,
     "command-r-plus": 3,
     "command-r-plus-FC-optimized": 3,
     "command-r-plus-optimized": 3,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 0.15,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 1,
 }
 
 OUTPUT_PRICE_PER_MILLION_TOKEN = {
@@ -369,6 +368,7 @@
     "claude-instant-1.2": 5.51,
     "mistral-large-2402-FC-Any": 24,
     "mistral-large-2402-FC-Auto": 24,
+    "mistral-small-2402": 24,
     "mistral-medium-2312": 8.1,
     "mistral-small-2402-FC-Any": 6,
     "mistral-small-2402-FC-Auto": 6,
@@ -384,50 +384,38 @@
     "gpt-3.5-turbo-0125": 2,
     "gpt-3.5-turbo-0125-FC": 2,
     "gemini-1.0-pro": 2,
-    "gemini-1.5-pro": 14,
+    "gemini-1.5-pro-preview-0409": 14,
     "databricks-dbrx-instruct": 6.75,
     "command-r-plus-FC": 15,
     "command-r-plus": 15,
     "command-r-plus-FC-optimized": 15,
     "command-r-plus-optimized": 15,
-    "meta-llama/Meta-Llama-3-8B-Instruct": 0.15,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 1,
 }
 
 # The latency of the open-source models are hardcoded here.
 # Because we do batching when generating the data, so the latency is not accurate from the result data.
-# This is the latency for the whole batch of data.
+# This is the latency for the whole batch of data, when using 8 V100 GPUs.
 OSS_LATENCY = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 2040,
-    "google/gemma-7b-it": 161,
-    "glaiveai/glaive-function-calling-v1": 99,
-    "NousResearch/Hermes-2-Pro-Mistral-7B": 666,
+    "deepseek-ai/deepseek-coder-6.7b-instruct": 909,
+    "google/gemma-7b-it": 95,
+    "NousResearch/Hermes-2-Pro-Mistral-7B": 135,
     "meta-llama/Meta-Llama-3-8B-Instruct": 73,
-    "meta-llama/Meta-Llama-3-70B-Instruct": 304,
-}
-
-OSS_INPUT_TOKEN = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 884190,
-    "google/gemma-7b-it": 733701,
-}
-
-OSS_OUTPUT_TOKEN = {
-    "deepseek-ai/deepseek-coder-6.7b-instruct": 2009421,
-    "google/gemma-7b-it": 130206,
+    "meta-llama/Meta-Llama-3-70B-Instruct": 307,
+    "gorilla-openfunctions-v2": 83,
 }
 
 
 NO_COST_MODELS = [
     "Nexusflow-Raven-v2",
     "fire-function-v1-FC",
-    "meetkai_functionary-medium-v2.4-FC",
-    "meetkai_functionary-small-v2.2-FC",
-    "meetkai_functionary-small-v2.4-FC",
+    "meetkai/functionary-medium-v2.4-FC",
+    "meetkai/functionary-small-v2.2-FC",
+    "meetkai/functionary-small-v2.4-FC",
 ]
 
-A100_PRICE_PER_HOUR = (
-    10.879 / 8
-) # Price got from AZure, 10.879 per hour for 8 A100, 3 years reserved
+# Price got from AZure, 22.032 per hour for 8 V100, Pay As You Go Total Price
+# Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/
+V100_x8_PRICE_PER_HOUR = 22.032


 def extract_after_test(input_string):
@@ -720,7 +708,7 @@ def get_metric(model_name, cost_data, latency_data):
             "N/A",
         )
         mean_latency = round(mean_latency, 2)
-        cost = mean_latency * 1000 * A100_PRICE_PER_HOUR / 3600
+        cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
         cost = round(cost, 2)
 
     elif len(latency_data["data"]) != 0:
@@ -732,7 +720,7 @@
         percentile_95_latency = round(percentile_95_latency, 2)
 
         if model_name not in INPUT_PRICE_PER_MILLION_TOKEN:
-            cost = sum(latency_data["data"]) * A100_PRICE_PER_HOUR / 3600
+            cost = sum(latency_data["data"]) * V100_x8_PRICE_PER_HOUR / 3600
             cost = round(cost, 2)
 
         if model_name in NO_COST_MODELS:
