From a265ec13854df858955a3aefc77329c833805d26 Mon Sep 17 00:00:00 2001
From: Hunter Heston
Date: Wed, 11 Dec 2024 14:40:38 -0800
Subject: [PATCH] feat(js/plugins/checks): checks evaluator plugin returns multiple scores (#1370)

---
 .gitignore                          |  1 +
 js/plugins/checks/README.md         | 11 ++------
 js/plugins/checks/src/evaluation.ts | 43 +++++++++++++++--------------
 3 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore
index d53fc1823..46736ee1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ js/testapps/firebase-functions-sample1/.firebase
 js/testapps/firebase-functions-sample1/.firebaserc
 js/testapps/firebase-functions-sample1/public/bundle.js
 js/testapps/firebase-functions-sample1/public/config.js
+.genkit
 js/**/.genkit
 samples/**/.genkit
 go/**/.genkit
diff --git a/js/plugins/checks/README.md b/js/plugins/checks/README.md
index 5cf804438..9b26f2fab 100644
--- a/js/plugins/checks/README.md
+++ b/js/plugins/checks/README.md
@@ -78,16 +78,11 @@ Create a JSON file with the data you want to test. Add as many test cases as you
 
 ```
 
-### Run the evaluators
+### Run the evaluator
 
 ```bash
-# Run just the DANGEROUS_CONTENT classifier.
-genkit eval:run test-dataset.json --evaluators=checks/dangerous_content
-```
-
-```bash
-# Run all classifiers.
-genkit eval:run test-dataset.json --evaluators=checks/dangerous_content,checks/pii_soliciting_reciting,checks/harassment,checks/sexually_explicit,checks/hate_speech,checks/medical_info,checks/violence_and_gore,checks/obscenity_and_profanity
+# Run all configured classifiers.
+genkit eval:run test-dataset.json --evaluators=checks/guardrails
 ```
 
 ### View the results
diff --git a/js/plugins/checks/src/evaluation.ts b/js/plugins/checks/src/evaluation.ts
index 58ad2b115..a3eb9afcc 100644
--- a/js/plugins/checks/src/evaluation.ts
+++ b/js/plugins/checks/src/evaluation.ts
@@ -64,7 +64,7 @@ export function checksEvaluators(
   auth: GoogleAuth,
   metrics: ChecksEvaluationMetric[],
   projectId: string
-): EvaluatorAction[] {
+): EvaluatorAction {
   const policy_configs: ChecksEvaluationMetricConfig[] = metrics.map(
     (metric) => {
       const metricType = isConfig(metric) ? metric.type : metric;
@@ -77,11 +77,7 @@ export function checksEvaluators(
     }
   );
 
-  const evaluators = policy_configs.map((policy_config) => {
-    return createPolicyEvaluator(projectId, auth, ai, policy_config);
-  });
-
-  return evaluators;
+  return createPolicyEvaluator(projectId, auth, ai, policy_configs);
 }
 
 function isConfig(
@@ -104,15 +100,13 @@ function createPolicyEvaluator(
   projectId: string,
   auth: GoogleAuth,
   ai: Genkit,
-  policy_config: ChecksEvaluationMetricConfig
+  policy_config: ChecksEvaluationMetricConfig[]
 ): EvaluatorAction {
-  const policyType = policy_config.type as string;
-
   return ai.defineEvaluator(
     {
-      name: `checks/${policyType.toLowerCase()}`,
-      displayName: policyType,
-      definition: `Evaluates text against the Checks ${policyType} policy.`,
+      name: 'checks/guardrails',
+      displayName: 'checks/guardrails',
+      definition: `Evaluates input text against the Checks ${policy_config.map((policy) => policy.type)} policies.`,
     },
     async (datapoint: BaseEvalDataPoint) => {
       const partialRequest = {
@@ -121,10 +115,12 @@
             content: datapoint.output as string,
           },
         },
-        policies: {
-          policy_type: policy_config.type,
-          threshold: policy_config.threshold,
-        },
+        policies: policy_config.map((config) => {
+          return {
+            policy_type: config.type,
+            threshold: config.threshold,
+          };
+        }),
       };
 
       const response = await checksEvalInstance(
@@ -134,13 +130,18 @@
         ResponseSchema
       );
 
-      return {
-        evaluation: {
-          score: response.policyResults[0].score,
+      const evaluationResults = response.policyResults.map((result) => {
+        return {
+          id: result.policyType,
+          score: result.score,
           details: {
-            reasoning: response.policyResults[0].violationResult,
+            reasoning: `Status ${result.violationResult}`,
          },
-        },
+        };
+      });
+
+      return {
+        evaluation: evaluationResults,
         testCaseId: datapoint.testCaseId,
       };
     }
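
After this change the plugin exposes a single `checks/guardrails` evaluator that returns one score entry per configured policy, instead of one evaluator per policy. Below is a minimal sketch of the result shape that the new code builds, based on the `evaluation`/`testCaseId` object in the patch above; the `GuardrailScore`/`GuardrailEvaluation` names are hypothetical, and the policy ids, scores, and test case id are illustrative rather than real Checks API output.

```ts
// Sketch only: the multi-score result shape produced by checks/guardrails,
// mirroring the object returned from the evaluator callback in this patch.
// Type names are hypothetical; values are illustrative, not real API output.

interface GuardrailScore {
  id: string; // policy type, e.g. 'DANGEROUS_CONTENT'
  score: number; // score reported for that policy
  details: {
    reasoning: string; // violation status, e.g. 'Status NON_VIOLATIVE'
  };
}

interface GuardrailEvaluation {
  evaluation: GuardrailScore[]; // one entry per configured policy
  testCaseId: string;
}

// Example: one dataset entry evaluated against two policies.
const example: GuardrailEvaluation = {
  evaluation: [
    {
      id: 'DANGEROUS_CONTENT',
      score: 0.12,
      details: { reasoning: 'Status NON_VIOLATIVE' },
    },
    {
      id: 'HARASSMENT',
      score: 0.03,
      details: { reasoning: 'Status NON_VIOLATIVE' },
    },
  ],
  testCaseId: 'example-test-case',
};

console.log(example.evaluation.map((s) => `${s.id}: ${s.score}`).join(', '));
```

Because all configured policies are scored by one evaluator, a single `genkit eval:run test-dataset.json --evaluators=checks/guardrails` invocation covers every policy, which is what the simplified README command in this patch relies on.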