
Commit

Fix dspy.Evaluate's handling of exceptions (from 2.5.30) (#1839)
okhat authored Nov 22, 2024
1 parent ee6c166 commit 44b3331
Showing 1 changed file with 20 additions and 31 deletions.
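In short: the per-example try/except inside process_item is removed; a failed example now comes back from ParallelExecutor as None and is backfilled with an empty dspy.Prediction plus the new failure_score (default 0.0), so the denominator stays len(devset). Below is a minimal usage sketch of the new parameter; the toy devset, my_metric, and program are hypothetical placeholders for illustration, not anything defined in this commit:

    import dspy

    # Hypothetical toy data and metric, purely illustrative.
    devset = [dspy.Example(question="2+2?", answer="4").with_inputs("question")]

    def my_metric(example, prediction, trace=None):
        return example.answer == prediction.answer

    evaluate = dspy.Evaluate(
        devset=devset,
        metric=my_metric,
        num_threads=8,
        display_progress=True,
        failure_score=0.0,  # new in this commit: the score recorded for examples that raise
    )
    # `program` would be any dspy.Module, e.g. dspy.Predict("question -> answer"), with an LM configured:
    # score = evaluate(program)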
51 changes: 20 additions & 31 deletions dspy/evaluate/evaluate.py
@@ -54,6 +54,7 @@ def __init__(
         return_all_scores=False,
         return_outputs=False,
         provide_traceback=False,
+        failure_score=0.0,
         **_kwargs,
     ):
         self.devset = devset
@@ -65,6 +66,7 @@ def __init__(
         self.return_all_scores = return_all_scores
         self.return_outputs = return_outputs
         self.provide_traceback = provide_traceback
+        self.failure_score = failure_score

     def __call__(
         self,
@@ -85,7 +87,6 @@ def __call__(
         return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
         return_outputs = return_outputs if return_outputs is not None else self.return_outputs

-        devset = list(enumerate(devset))
         tqdm.tqdm._instances.clear()

         executor = ParallelExecutor(
@@ -96,39 +97,27 @@ def __call__(
             compare_results=True,
         )

-        def process_item(item):
-            try:
-                example_idx, example = item
-                prediction = program(**example.inputs())
-                score = metric(example, prediction)
+        def process_item(example):
+            prediction = program(**example.inputs())
+            score = metric(example, prediction)

-                # Increment assert and suggest failures to program's attributes
-                if hasattr(program, "_assert_failures"):
-                    program._assert_failures += dspy.settings.get("assert_failures")
-                if hasattr(program, "_suggest_failures"):
-                    program._suggest_failures += dspy.settings.get("suggest_failures")
+            # Increment assert and suggest failures to program's attributes
+            if hasattr(program, "_assert_failures"):
+                program._assert_failures += dspy.settings.get("assert_failures")
+            if hasattr(program, "_suggest_failures"):
+                program._suggest_failures += dspy.settings.get("suggest_failures")

-                return example_idx, example, prediction, score
-            except Exception:
-                return example_idx, example, {}, 0.0
+            return prediction, score

         results = executor.execute(process_item, devset)
-        reordered_devset = [r for r in results if r is not None]
+        assert len(devset) == len(results)

-        ncorrect = sum(score for _, _, _, score in reordered_devset)
-        ntotal = len(reordered_devset)
-
-        if ntotal == 0:
-            logger.warning("No valid results to compute metrics.")
-            return 0.0
+        results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results]
+        results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results)]
+        ncorrect, ntotal = sum(score for *_, score in results), len(devset)

         logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)")

-        predicted_devset = sorted(reordered_devset)
-
-        if return_outputs:  # Handle the return_outputs logic
-            results = [(example, prediction, score) for _, example, prediction, score in predicted_devset]
-
-
         def prediction_is_dictlike(prediction):
             # Downstream logic for displaying dictionary-like predictions depends solely on the predictions
             # having a method called `items()` for iterating through key/value pairs
@@ -140,12 +129,12 @@ def prediction_is_dictlike(prediction):
                 if prediction_is_dictlike(prediction)
                 else dict(example) | {"prediction": prediction, "correct": score}
             )
-            for _, example, prediction, score in predicted_devset
+            for example, prediction, score in results
         ]

-        result_df = pd.DataFrame(data)
-
         # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0)
+        result_df = pd.DataFrame(data)
         result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell)

         # Rename the 'correct' column to the name of the metric object
@@ -179,9 +168,9 @@ def prediction_is_dictlike(prediction):
             display(HTML(message))

         if return_all_scores and return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in predicted_devset]
+            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
         if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset]
+            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
         if return_outputs:
             return round(100 * ncorrect / ntotal, 2), results

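The heart of the fix is the backfill-and-zip step in the second-to-last hunk: rather than enumerating the devset, filtering out None results, and sorting them back into order, the new code keeps results index-aligned with devset and substitutes a placeholder for each failure. A standalone sketch of that pattern, under the same assumption the diff makes (the executor yields one slot per example, None on failure); backfill_results is an illustrative name, not this file's API:

    import dspy

    def backfill_results(devset, results, failure_score=0.0):
        # One result slot per example; the executor reports failures as None.
        assert len(devset) == len(results)
        results = [((dspy.Prediction(), failure_score) if r is None else r) for r in results]
        # zip pairs each (prediction, score) with its original example, preserving order.
        return [(example, prediction, score) for example, (prediction, score) in zip(devset, results)]

Because failed slots are backfilled rather than dropped, ntotal is always len(devset), and the reported average can no longer drift upward when errored examples silently shrink the denominator.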
