
[evaluation] Add support for using evaluate() with evaluators that have missing inputs #38276

Merged: 15 commits, Nov 2, 2024
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_043418c052"
"Tag": "python/evaluation/azure-ai-evaluation_5ec60aae8a"
}
@@ -287,7 +287,13 @@ def _validate_columns_for_evaluators(
# Ignore the missing fields if "conversation" presents in the input data
missing_inputs = []
else:
- missing_inputs = [col for col in evaluator_params if col not in new_df.columns]
+ optional_params = (
+     evaluator._OPTIONAL_PARAMS  # pylint: disable=protected-access
+     if hasattr(evaluator, "_OPTIONAL_PARAMS")
+     else []
+ )
+ excluded_params = set(new_df.columns).union(optional_params)
+ missing_inputs = [col for col in evaluator_params if col not in excluded_params]

# If "conversation" is the only parameter and it is missing, keep it in the missing inputs
# Otherwise, remove it from the missing inputs
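For context, the behavior this hunk adds can be sketched standalone: inputs that an evaluator declares optional via a class-level _OPTIONAL_PARAMS list are no longer reported as missing when they are absent from both the data columns and the column mapping. FakeEvaluator and find_missing_inputs below are illustrative names, not SDK code.

import inspect

class FakeEvaluator:
    # Hypothetical evaluator: "query" is declared optional, mirroring _OPTIONAL_PARAMS.
    _OPTIONAL_PARAMS = ["query"]

    def __call__(self, *, response, context, query=None):
        return {"score": 1.0}

def find_missing_inputs(evaluator, data_columns):
    # Required inputs are the call-signature parameters minus the data columns
    # and minus anything the evaluator marks as optional.
    evaluator_params = [
        p for p in inspect.signature(evaluator).parameters if p != "self"
    ]
    optional_params = getattr(evaluator, "_OPTIONAL_PARAMS", [])
    excluded_params = set(data_columns).union(optional_params)
    return [col for col in evaluator_params if col not in excluded_params]

# A dataset without a "query" column no longer reports a missing input.
print(find_missing_inputs(FakeEvaluator(), ["response", "context"]))  # []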
@@ -179,9 +179,7 @@ async def __call__(
if not randomization_seed:
randomization_seed = randint(0, 1000000)

- regular_sim = AdversarialSimulator(
-     azure_ai_project=self.azure_ai_project, credential=self.credential
- )
+ regular_sim = AdversarialSimulator(azure_ai_project=self.azure_ai_project, credential=self.credential)
regular_sim_results = await regular_sim(
scenario=scenario,
target=target,
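The unchanged context above falls back to a random seed when the caller does not pass randomization_seed, so the regular and jailbreak simulation passes can share the same value. A minimal sketch of that defaulting pattern, under that assumption; choose_seed is an illustrative name, not the SDK API.

from random import randint
from typing import Optional

def choose_seed(randomization_seed: Optional[int] = None) -> int:
    # Mirror the `if not randomization_seed:` branch above: draw a seed once when none is given.
    if not randomization_seed:
        randomization_seed = randint(0, 1000000)
    return randomization_seed

assert choose_seed(42) == 42          # an explicit seed is kept, so paired runs stay reproducible
assert 0 <= choose_seed() <= 1000000  # otherwise a fresh seed is drawn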
9 changes: 9 additions & 0 deletions sdk/evaluation/azure-ai-evaluation/tests/conftest.py
@@ -142,11 +142,20 @@ def live_connection_file_values():
add_general_regex_sanitizer(regex=project_scope["project_name"], value=SanitizedValues.WORKSPACE_NAME)
add_general_regex_sanitizer(regex=model_config["azure_endpoint"], value=mock_model_config["azure_endpoint"])

def promptflow_root_run_id_sanitizer():
"""Sanitize the promptflow service isolation values."""
add_general_regex_sanitizer(
value="root_run_id",
regex=r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"',
replacement='"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"',
)

azure_workspace_triad_sanitizer()
azureopenai_connection_sanitizer()
openai_stainless_default_headers()
azure_ai_generative_sanitizer()
live_connection_file_values()
promptflow_root_run_id_sanitizer()


@pytest.fixture
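The new sanitizer is, in effect, a regex substitution applied to recorded payloads. A minimal sketch of the equivalent transformation using Python's re module (the test-proxy plumbing behind add_general_regex_sanitizer is not shown here, and the run-id suffix in the sample payload is illustrative):

import re

pattern = r'"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_[^"]+"'
replacement = '"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"'

# Illustrative recorded payload containing a machine-specific run id.
recorded = '{"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_20241101_123456_789"}'
print(re.sub(pattern, replacement, recorded))
# {"root_run_id": "azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_SANITIZED"}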
@@ -29,6 +29,12 @@ def data_file():
return os.path.join(data_path, "evaluate_test_data.jsonl")


@pytest.fixture
def data_file_no_query():
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
return os.path.join(data_path, "evaluate_test_data_no_query.jsonl")
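The contents of evaluate_test_data_no_query.jsonl are not shown in this diff; presumably it mirrors evaluate_test_data.jsonl with the optional "query" field removed. A hedged sketch of deriving such a file, assuming the source rows carry a "query" key; strip_query is an illustrative helper, not part of the test suite:

import json

def strip_query(src_path: str, dst_path: str) -> None:
    # Copy each JSONL row, dropping the optional "query" input if present.
    with open(src_path, encoding="utf-8") as src, open(dst_path, "w", encoding="utf-8") as dst:
        for line in src:
            row = json.loads(line)
            row.pop("query", None)
            dst.write(json.dumps(row) + "\n")

# strip_query("data/evaluate_test_data.jsonl", "data/evaluate_test_data_no_query.jsonl")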


@pytest.fixture
def data_convo_file():
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
@@ -725,3 +731,91 @@ def test_evaluate_aggregation(self, data_file, return_json, aggregate_return_jso
@pytest.mark.skip(reason="TODO: Add test back")
def test_prompty_with_threadpool_implementation(self):
pass

def test_evaluate_with_groundedness_evaluator_with_query(self, model_config, data_file):
# data
input_data = pd.read_json(data_file, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_file,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)
assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()
assert "grounded.groundedness" in metrics.keys()
assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)
assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
assert result["studio_url"] is None

def test_evaluate_with_groundedness_evaluator_without_query(self, model_config, data_file_no_query):
# data
input_data = pd.read_json(data_file_no_query, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_file_no_query,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)

assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()

assert "grounded.groundedness" in metrics.keys()

assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)

assert row_result_df["outputs.grounded.groundedness"][2] in [3, 4, 5]
assert result["studio_url"] is None

def test_evaluate_with_groundedness_evaluator_with_convo(self, model_config, data_convo_file):
# data
input_data = pd.read_json(data_convo_file, lines=True)

groundedness_eval = GroundednessEvaluator(model_config)

# run the evaluation
result = evaluate(
data=data_convo_file,
evaluators={"grounded": groundedness_eval},
)

row_result_df = pd.DataFrame(result["rows"])
metrics = result["metrics"]

# validate the results
assert result is not None
assert result["rows"] is not None
assert row_result_df.shape[0] == len(input_data)

assert "outputs.grounded.groundedness" in row_result_df.columns.to_list()

assert "grounded.groundedness" in metrics.keys()

assert metrics.get("grounded.groundedness") == list_mean_nan_safe(
row_result_df["outputs.grounded.groundedness"]
)
assert row_result_df["outputs.grounded.groundedness"][1] in [3, 4, 5]
assert result["studio_url"] is None