Skip to content

Commit

Permalink
Adds wandb to eval files (ShishirPatil#114)
Browse files Browse the repository at this point in the history
Add Weights & Biases logging to:
- log the llm responses in a file and in a W&B Table to explore
- Keep track of the progress of llm responses (helpful during long
llm response queries)
- log the ast evaluation accuracy of the logged responses
  • Loading branch information
morganmcg1 authored Aug 24, 2023
1 parent 41b0559 commit bcd2ce1
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 1 deletion.
13 changes: 13 additions & 0 deletions eval/eval-scripts/ast_eval_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,15 @@ def main(args):
else:
pass

# Optionally push the final metrics to Weights & Biases.
# BUG FIX: the original tested the undefined name `use_wandb`; the flag
# lives on the parsed args namespace.
if args.use_wandb:
    if args.wandb_run_id is not None:
        # Resume an existing run so these metrics append to it.
        wandb.init(project=args.wandb_project, entity=args.wandb_entity, id=args.wandb_run_id, resume="must")
    else:
        wandb.init(project=args.wandb_project, entity=args.wandb_entity)

    # BUG FIX: the original used subscript-slice syntax
    # (wandb.summary['key': value]), which performs a lookup with a slice
    # key instead of storing anything; use plain assignment.
    wandb.summary['final_functionality_accuracy'] = total_correct / len(llm_responses)
    wandb.summary['final_hallucination'] = total_hallucination / len(llm_responses)

print('Final Functionality accuracy: ', total_correct / len(llm_responses))
print('Final hallucination: ', total_hallucination/len(llm_responses))

Expand All @@ -169,5 +178,9 @@ def main(args):
# CLI arguments for the AST-based evaluation script.
parser.add_argument("--api_dataset", type=str, default=None, help="path to your api dataset")
parser.add_argument("--apibench", type=str, default=None, help="path to your apibench dataset including the question and answer pairs")
parser.add_argument("--llm_responses", type=str, default=None, help="path to the language model responses")
# Weights & Biases logging is opt-in via --use_wandb; the remaining flags
# select the destination project/entity and (optionally) an existing run.
parser.add_argument("--use_wandb", action='store_true', help="pass this argument to turn on Weights & Biases logging of the LLM responses")
parser.add_argument("--wandb_project", type=str, default="gorilla-api", help="Weights & Biases project name")
parser.add_argument("--wandb_entity", type=str, default=None, help="Weights & Biases entity name")
parser.add_argument("--wandb_run_id", type=str, default=None, help="pass W&B run id to append results to that run, otherwise a new W&B run is logged")
args = parser.parse_args()
main(args)
48 changes: 47 additions & 1 deletion eval/get_llm_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import anthropic
import multiprocessing as mp
import time
import wandb
from tenacity import retry, wait_exponential

def encode_question(question, api_name):
"""Encode multiple prompt instructions into a single string."""
Expand Down Expand Up @@ -47,6 +49,7 @@ def encode_question(question, api_name):
prompts.append({"role": "user", "content": prompt})
return prompts

@retry(wait=wait_exponential(multiplier=1, min=10, max=120), reraise=True)
def get_response(get_response_input, api_key):
question, question_id, api_name, model = get_response_input
question = encode_question(question, api_name)
Expand Down Expand Up @@ -82,6 +85,7 @@ def get_response(get_response_input, api_key):
def process_entry(entry, api_key):
    """Fetch one LLM response and optionally report progress to W&B.

    Args:
        entry: tuple of (question, question_id, api_name, model).
        api_key: API key forwarded to get_response.

    Returns:
        The result produced by get_response for this entry.
    """
    question, question_id, api_name, model = entry
    result = get_response((question, question_id, api_name, model), api_key)
    # BUG FIX: only log progress when a W&B run is active; the original
    # called wandb.log unconditionally, which raises when --use_wandb was
    # not passed (wandb.init never ran).
    if wandb.run is not None:
        wandb.log({"question_id_completed": question_id})
    return result

def write_result_to_file(result, output_file):
Expand All @@ -102,8 +106,23 @@ def callback_with_lock(result, output_file):
# CLI arguments for the response-collection script.
parser.add_argument("--output_file", type=str, default=None, help="the output file this script writes to")
parser.add_argument("--question_data", type=str, default=None, help="path to the questions data file")
parser.add_argument("--api_name", type=str, default=None, help="this will be the api dataset name you are testing, only support ['torchhub', 'tensorhun', 'huggingface'] now")
# Weights & Biases logging is opt-in via --use_wandb.
parser.add_argument("--use_wandb", action='store_true', help="pass this argument to turn on Weights & Biases logging of the LLM responses")
parser.add_argument("--wandb_project", type=str, default="gorilla-api", help="Weights & Biases project name")
parser.add_argument("--wandb_entity", type=str, default=None, help="Weights & Biases entity name")
args = parser.parse_args()

if args.use_wandb:
    # Start a W&B run, recording the CLI configuration for reproducibility.
    run_config = {
        "api_name": args.api_name,
        "model": args.model,
        "question_data": args.question_data,
        "output_file": args.output_file,
    }
    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        config=run_config,
    )

start_time = time.time()
# Read the question file
questions = []
Expand All @@ -127,4 +146,31 @@ def callback_with_lock(result, output_file):
pool.join()

end_time = time.time()
print("Total time used: ", end_time - start_time)
elapsed_time = end_time - start_time
print("Total time used: ", elapsed_time)

if args.use_wandb:
    print("\nSaving all responses to Weights & Biases...\n")

    wandb.summary["elapsed_time_s"] = elapsed_time

    # Re-read the results file and mirror each JSONL record into a W&B Table;
    # the table schema is taken from the first record's keys.
    tbl = None
    line_count = 0
    with open(args.output_file, 'r') as file:
        for i, line in enumerate(file):
            data = json.loads(line.strip())

            if i == 0:
                tbl = wandb.Table(columns=list(data.keys()))
            tbl.add_data(*list(data.values()))
            line_count += 1

    # Log the Table to W&B (skipped when the output file was empty, which
    # would otherwise leave `tbl` undefined).
    if tbl is not None:
        wandb.log({"llm_eval_responses": tbl})
    wandb.summary["response_count"] = line_count

    # Also log the results file as a W&B Artifact.
    # BUG FIX: the original referenced the nonexistent attribute
    # `args._model` (AttributeError); the parsed flag is `args.model`.
    wandb.log_artifact(args.output_file,
                       name=f"{args.api_name}-{args.model}-eval-results",
                       type="eval-results",
                       aliases=[f"{line_count}-responses"]
                       )

0 comments on commit bcd2ce1

Please sign in to comment.