[BFCL] Add the option to log to WandB during bfcl evaluate (ShishirPatil#736)

In our internal use of BFCL at Valence Labs, we use WandB extensively to
centralize the results of our benchmarking.
This PR adds a `--wandb-project` CLI argument to the `bfcl evaluate`
command to upload the generated `.csv` files to WandB. I wanted to open
this PR to see if this would be a feature of interest for BFCL :)

Here is an example command:
```bash
bfcl evaluate --model gpt-3.5-turbo-0125 \
    --wandb-project <wandb_entity:wandb_project>
```

This will log `data_live.csv`, `data_non_live.csv`, and `data_overall.csv`
as dataframe artifacts to the WandB project `wandb_project` under the
entity `wandb_entity`.
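
For reference, here is a minimal sketch (not part of this PR) of pulling the logged CSVs back down afterwards; `my_entity` and `my_project` are placeholder names, while `bfcl_results` and the `.csv` filenames match what this change uploads:

```python
# Minimal retrieval sketch -- "my_entity" and "my_project" are placeholders.
import pandas as pd
import wandb

api = wandb.Api()
# "bfcl_results" is the artifact name created by this change; ":latest" selects the newest version.
artifact = api.artifact("my_entity/my_project/bfcl_results:latest", type="dataset")
local_dir = artifact.download()  # the raw .csv files were attached via add_file()

overall = pd.read_csv(f"{local_dir}/data_overall.csv")
print(overall.head())
```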

---------

Co-authored-by: Huanzhi (Hans) Mao <[email protected]>
ThomasRochefortB and HuanzhiMao authored Nov 19, 2024
1 parent 5df6372 commit 0fc2bd2
Showing 7 changed files with 92 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -34,3 +34,6 @@ berkeley-function-call-leaderboard/utils/ground_truth_conversation/

.direnv/
.venv

# Ignore the wandb cache:
**/wandb/
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/.env.example
@@ -23,3 +23,6 @@ RAPID_API_KEY=
EXCHANGERATE_API_KEY=
OMDB_API_KEY=
GEOCODE_API_KEY=

# [OPTIONAL] Required only if you want WandB to log the generated .csv files, in the format 'ENTITY:PROJECT'
WANDB_BFCL_PROJECT=ENTITY:PROJECT
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,6 +2,7 @@

All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.

- [Nov 18, 2024] [#736](https://github.com/ShishirPatil/gorilla/pull/736): Add the option to additionally log the evaluation results to [WandB](https://github.com/wandb/wandb) artifacts. Users can enable this feature by providing the entity and project name in `WANDB_BFCL_PROJECT` in the `.env` file.
- [Nov 18, 2024] [#768](https://github.com/ShishirPatil/gorilla/pull/768), [#770](https://github.com/ShishirPatil/gorilla/pull/770): Resolve issues in Gemini models (FC mode) related to handling scenarios with no tools available and cases where the model output is empty.
- [Nov 17, 2024] [#767](https://github.com/ShishirPatil/gorilla/pull/767): Fix price and latency calculation. A merge conflict had introduced a duplicated line, causing the input and output tokens for each entry to be counted multiple times.
- [Nov 15, 2024] [#762](https://github.com/ShishirPatil/gorilla/pull/762): Supply `data_multi_turn.csv` for multi-turn evaluation results
14 changes: 14 additions & 0 deletions berkeley-function-call-leaderboard/README.md
@@ -312,6 +312,20 @@ If you want to run `live_simple` and `javascript` tests for a few models and `go
bfcl evaluate --model gorilla-openfunctions-v2 claude-3-5-sonnet-20240620 gpt-4-0125-preview gemini-1.5-pro-preview-0514 --test-category live_simple javascript
```

#### WandB Evaluation Logging

If you also want to log the evaluation results as WandB artifacts to a specific WandB entity and project, first install `wandb` as an optional dependency:

```bash
pip install -e .[wandb]
```

Then specify the entity and project you want to log to on WandB using the `WANDB_BFCL_PROJECT` environment variable in the `.env` file, in the following format:

```bash
WANDB_BFCL_PROJECT=ENTITY:PROJECT
```
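
For example, a complete workflow might look like the following (the entity and project names below are placeholders, and `wandb login` only needs to be run once per machine):

```bash
# Placeholder names: replace "my-team" and "bfcl-runs" with your own entity and project.
wandb login                      # or export WANDB_API_KEY beforehand

# In your .env file:
#   WANDB_BFCL_PROJECT=my-team:bfcl-runs

bfcl evaluate --model gorilla-openfunctions-v2 --test-category live_simple
```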

### Model-Specific Optimization

Some companies have proposed optimization strategies in their models' handlers, which we (BFCL) consider unfair to other models, as those optimizations are not generalizable to all models. Therefore, we have disabled those optimizations during the evaluation process by default. You can enable them by setting the `USE_{COMPANY}_OPTIMIZATION` flag to `True` in the `.env` file.

@@ -661,4 +661,4 @@ def get_handler(model_name):

load_dotenv(dotenv_path=DOTENV_PATH, verbose=True, override=True) # Load the .env file

main(args.model, args.test_category, args.api_sanity_check)
main(args.model, args.test_category, args.api_sanity_check)
@@ -1,8 +1,10 @@
import os
import statistics
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from bfcl._apply_function_credential_config import apply_function_credential_config
from bfcl.eval_checker.constant import *
from bfcl.eval_checker.executable_eval.custom_exception import BadAPIStatusError
@@ -239,6 +241,7 @@ def get_cost_letency_info(model_name, cost_data, latency_data):

return cost, mean_latency, std_latency, percentile_95_latency


# TODO: Refactor this function to reduce code duplication
def generate_leaderboard_csv(
leaderboard_table, output_path, eval_models=None, eval_categories=None
@@ -250,13 +253,13 @@ def generate_leaderboard_csv(
data_combined = []
for model_name, value in leaderboard_table.items():
model_name_escaped = model_name.replace("_", "/")

cost_data = value.get("cost", {"input_data": [], "output_data": []})
latency_data = value.get("latency", {"data": []})
cost, latency_mean, latency_std, percentile_95_latency = get_cost_letency_info(
model_name_escaped, cost_data, latency_data
)

# Non-Live Score
python_simple_ast_non_live = value.get("simple", {"accuracy": 0, "total_count": 0})
python_multiple_ast_non_live = value.get(
@@ -343,7 +346,7 @@ def generate_leaderboard_csv(
irrelevance_non_live["accuracy"],
]
)

# Live Score
python_simple_ast_live = value.get(
"live_simple", {"accuracy": 0, "total_count": 0}
@@ -380,7 +383,7 @@ def generate_leaderboard_csv(
relevance_live,
]
)

data_live.append(
[
"N/A",
@@ -395,7 +398,7 @@ def generate_leaderboard_csv(
relevance_live["accuracy"],
]
)

# Multi-Turn Score
multi_turn_base = value.get("multi_turn_base", {"accuracy": 0, "total_count": 0})
multi_turn_miss_func = value.get(
@@ -427,12 +430,12 @@ def generate_leaderboard_csv(
multi_turn_long_context["accuracy"],
]
)

# Total Score
single_turn_ast = calculate_unweighted_accuracy([overall_accuracy_live, overall_accuracy_non_live])
total_irrelevance = calculate_unweighted_accuracy([irrelevance_non_live, irrelevance_live])
total_relevance = relevance_live

total_overall_accuracy = calculate_unweighted_accuracy(
[
overall_accuracy_live,
@@ -477,7 +480,7 @@ def generate_leaderboard_csv(
MODEL_METADATA_MAPPING[model_name_escaped][3],
]
)

# Write Non-Live Score File
data_non_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_non_live)):
@@ -494,7 +497,7 @@ def generate_leaderboard_csv(
f.write(",".join(row) + "\n")
else:
f.write(",".join(row))

# Write Live Score File
data_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_live)):
@@ -559,6 +562,60 @@ def generate_leaderboard_csv(
# category_status, eval_models=eval_models, eval_categories=eval_categories
# )

wandb_project = os.getenv("WANDB_BFCL_PROJECT")
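# Only log to WandB if the user has replaced the ENTITY:PROJECT placeholder from .env.example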
if wandb_project and wandb_project != "ENTITY:PROJECT":
import wandb

# Initialize WandB run
wandb.init(
# wandb_project is 'entity:project'
entity=wandb_project.split(":")[0],
project=wandb_project.split(":")[1],
name=f"BFCL-v3-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
)

# Log CSV files to WandB
# Read the CSV files
non_live_df = pd.read_csv(output_path / "data_non_live.csv")
live_df = pd.read_csv(output_path / "data_live.csv")
multi_turn_df = pd.read_csv(output_path / "data_multi_turn.csv")
overall_df = pd.read_csv(output_path / "data_overall.csv")

# Convert DataFrames to WandB Tables
non_live_table = wandb.Table(dataframe=non_live_df)
live_table = wandb.Table(dataframe=live_df)
multi_turn_table = wandb.Table(dataframe=multi_turn_df)
overall_table = wandb.Table(dataframe=overall_df)

# Create artifacts
bfcl_artifact = wandb.Artifact("bfcl_results", type="dataset")

# Add tables to artifact
bfcl_artifact.add(non_live_table, "non_live_results")
bfcl_artifact.add(live_table, "live_results")
bfcl_artifact.add(multi_turn_table, "multi_turn_results")
bfcl_artifact.add(overall_table, "overall_results")

# Add raw CSV files to artifact
bfcl_artifact.add_file(str(output_path / "data_non_live.csv"))
bfcl_artifact.add_file(str(output_path / "data_live.csv"))
bfcl_artifact.add_file(str(output_path / "data_multi_turn.csv"))
bfcl_artifact.add_file(str(output_path / "data_overall.csv"))

# Log tables directly
wandb.log(
{
"Non-Live Results": non_live_table,
"Live Results": live_table,
"Multi-Turn Results": multi_turn_table,
"Overall Results": overall_table,
}
)

# Log artifact
wandb.log_artifact(bfcl_artifact)
wandb.finish()


def check_model_category_status(score_path):
result_path = score_path.replace("score", "result")
@@ -597,7 +654,7 @@ def check_model_category_status(score_path):
result_subdir = os.path.join(result_path, model_name)
if os.path.exists(result_subdir):
for result_file in os.listdir(result_subdir):
if result_file.endswith('.json'):
if result_file.endswith(".json"):
test_category = extract_test_category(result_file)
if test_category in category_status[model_name]:
category_status[model_name][test_category]["generated"] = True
@@ -640,8 +697,8 @@ def check_all_category_present(category_status, eval_models=None, eval_categorie
if first_time:
print(f"We are checking models: {eval_models} and categories: {eval_categories}")
print(f"\n{RED_FONT}{'=' * 30} Model Category Status {'=' * 30}{RESET}")
first_time = False
first_time = False

print(f"{RED_FONT}Model: {model_name}{RESET}")
if not_generated:
print(f"\n Missing results for {len(not_generated)} categories:")
1 change: 1 addition & 0 deletions berkeley-function-call-leaderboard/pyproject.toml
@@ -45,3 +45,4 @@ Repository = "https://github.com/ShishirPatil/gorilla/tree/main/berkeley-functio
[project.optional-dependencies]
oss_eval_vllm = ["vllm==0.6.3"]
oss_eval_sglang = ["sglang[all]"]
wandb = ["wandb==0.18.5"]
