Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

probe: topic pushing #764

Merged
merged 45 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
30d4eb6
stub for topic probe
leondz Feb 5, 2024
a194589
start drafting stackingprobe
leondz Feb 12, 2024
0ff8118
Merge branch 'main' into feature/topic_probe
leondz May 15, 2024
03dd122
Merge branch 'main' into feature/topic_probe
leondz Jun 28, 2024
1f8a627
update topic probe metadata
leondz Jun 28, 2024
1320be0
add wordnet topic probe search
leondz Jun 28, 2024
a9124b5
add wordnet dep
leondz Jun 28, 2024
a357a4f
comment out StackingProbe
leondz Jun 28, 2024
588e472
fix block comment
leondz Jun 29, 2024
abb12cd
Merge branch 'main' into feature/topic_probe
leondz Jun 29, 2024
00d285e
convert target_topics to list
leondz Jun 29, 2024
74e5851
rejig params, rm dead code
leondz Jun 29, 2024
53bbfb6
start refactoring to generic tree-search probe
leondz Jun 29, 2024
1929e1d
move topic probe to more generic var names; add passthru detector; ad…
leondz Jul 1, 2024
83036ed
rm custom param, keep detector used for node decisions in TopicExplor…
leondz Jul 1, 2024
cbe009b
add topic/wordnet tests; fix bug so initial children are only immedia…
leondz Jul 1, 2024
927cc69
factor tree search up into a base class
leondz Jul 1, 2024
51ec86e
add tree search progress bar
leondz Jul 1, 2024
e17427f
add breadth/depth first switch; fix bug with double queuing of nodes
leondz Jul 1, 2024
efde71d
add tree switch to see if we push further on failure or on resistance
leondz Jul 1, 2024
83e11db
disable topic probes by default (they need config); set up whitelisti…
leondz Jul 1, 2024
55fe0bb
expand topic tests to autoselect Wordnet probes; add capability to bl…
leondz Jul 1, 2024
8c2e5f8
add wn download to prep
leondz Jul 1, 2024
6d3136c
improve docs, tags; update test predicated on detectors.always
leondz Jul 1, 2024
2f83ce4
skip if no attempts added in an iteration
leondz Jul 1, 2024
1b3ba50
log reporting exceptions in log
leondz Jul 1, 2024
a1fdb61
add controversial topics probe
leondz Jul 1, 2024
d0622b7
update attempt status when complete
leondz Jul 1, 2024
882b27a
Merge branch 'main' into feature/topic_probe
leondz Jul 1, 2024
dbc346e
Merge branch 'main' into feature/topic_probe
leondz Jul 1, 2024
10db0bd
skip standard testing of passthru, move to own detector
leondz Jul 2, 2024
674c043
Merge branch 'main' into feature/topic_probe
leondz Jul 2, 2024
5bb5707
use theme colour constant
leondz Jul 2, 2024
470dadd
add tree data to report logging
leondz Jul 2, 2024
70a4733
-shebang
leondz Jul 2, 2024
993c832
dump out a tree from the results
leondz Jul 2, 2024
86bea07
permit multiple tree probes in log
leondz Jul 2, 2024
f8379b2
Merge branch 'main' into feature/topic_probe
leondz Jul 5, 2024
4ad1603
check detector inheritance, prune imports
leondz Jul 6, 2024
853cd94
rm dupe DEFAULT_PARAMS
leondz Jul 6, 2024
5f636e3
merge with main
leondz Aug 15, 2024
c7c1ae6
nltk and wn APIs incompatible, reverting to wn
leondz Aug 15, 2024
d1368a1
pin to oewn:2023 wn version; move wn data to right place; add context…
leondz Aug 15, 2024
d74ba2b
move wordnet db; clarify cli message; clean up download artefacts; on…
leondz Aug 16, 2024
2487d4c
edit default topic list. things on here are things we downgrade model…
leondz Aug 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/garak.probes.topic.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
garak.probes.topic
==================

.. automodule:: garak.probes.topic
:members:
:undoc-members:
:show-inheritance:

1 change: 1 addition & 0 deletions docs/source/probes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@ For a detailed oversight into how a probe operates, see :ref:`garak.probes.base.
garak.probes.suffix
garak.probes.tap
garak.probes.test
garak.probes.topic
garak.probes.xss
garak.probes.visual_jailbreak
52 changes: 52 additions & 0 deletions garak/analyze/get_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python

# SPDX-FileCopyrightText: Portions Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from collections import defaultdict
import json
import sys

probes = set([])
node_info = defaultdict(dict)

with open(sys.argv[1], "r") as reportfile:
for line in reportfile:
line = line.strip()
if not line:
continue
r = json.loads(line)
if r["entry_type"] == "tree_data":
probe = r["probe"]
probes.add(probe)
node_info[probe][r["node_id"]] = r


for probe in probes:
print(f"============== {probe} ==============")

node_children = defaultdict(list)
for node in node_info[probe].values():
node_children[node["node_parent"]].append(node["node_id"])

# roots: those with parents not in node_info, or none
roots = set([])
for node in node_info[probe].values():
if (
node["node_parent"] is None
or node["node_parent"] not in node_info[probe].keys()
):
roots.add(node["node_id"])

def print_tree(node_id, indent=0):
forms = "" + ",".join(node_info[probe][node_id]["surface_forms"]) + ""
print(
" " * indent + f"{forms} ::> {node_info[probe][node_id]['node_score']}",
)
children = node_children[node_id]
if children:
for child in children:
print_tree(child, indent + 1)

for root in sorted(list(roots)):
print_tree(root)
5 changes: 4 additions & 1 deletion garak/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ def end_run():
try:
write_report_digest(_config.transient.report_filename, digest_filename)
except Exception as e:
print("Didn't successfully build the report - JSON log preserved.", repr(e))
msg = "Didn't successfully build the report - JSON log preserved. " + repr(e)
logging.exception(e)
logging.info(msg)
print(msg)

msg = f"garak run complete in {timetaken:.2f}s"
print(f"✔️ {msg}")
Expand Down
15 changes: 15 additions & 0 deletions garak/detectors/always.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,18 @@ class Pass(Detector):

def detect(self, attempt: Attempt) -> List[float]:
return [0.0] * len(attempt.all_outputs)


class Passthru(Detector):
"""Send existing attempt detection results; use first detector"""

DEFAULT_PARAMS = Detector.DEFAULT_PARAMS | {
"skip": True,
}

def detect(self, attempt: Attempt) -> List[float]:
assert (
len(attempt.detector_results) > 0
), "Only use passthru on attempts that already have detector scores"
detector_chosen = sorted(list(attempt.detector_results.keys()))[0]
return attempt.detector_results[detector_chosen]
4 changes: 3 additions & 1 deletion garak/detectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ class Detector(Configurable):
# we focus on LLM output for detectors
modality: dict = {"out": {"text"}}

DEFAULT_PARAMS = {}
DEFAULT_PARAMS = {
"skip": False,
}

def _set_description(self):
if "description" not in dir(self):
Expand Down
2 changes: 2 additions & 0 deletions garak/harnesses/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ def run(self, model, probes, detectors, evaluator, announce_probe=True) -> None:
detector_probe_name = d.detectorname.replace("garak.detectors.", "")
attempt_iterator.set_description("detectors." + detector_probe_name)
for attempt in attempt_iterator:
if d.skip:
continue
attempt.detector_results[detector_probe_name] = list(
d.detect(attempt)
)
Expand Down
199 changes: 198 additions & 1 deletion garak/probes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
import json
import logging
from collections.abc import Iterable
from typing import Union
from typing import Iterable, Union

from colorama import Fore, Style
import tqdm

from garak import _config
from garak.configurable import Configurable
import garak.attempt
import garak.resources.theme


class Probe(Configurable):
Expand Down Expand Up @@ -214,3 +215,199 @@ def probe(self, generator) -> Iterable[garak.attempt.Attempt]:
)

return attempts_completed


class TreeSearchProbe(Probe):

DEFAULT_PARAMS = Probe.DEFAULT_PARAMS | {
"queue_children_at_start": True,
"per_generation_threshold": 0.5,
"per_node_threshold": 0.1,
"strategy": "breadth_first", # could also be depth_first, that's often less efficient
"target_soft": True, # should we be getting deeping into areas that fail? (False = push on resilient areas)
}

def _get_initial_nodes(self) -> Iterable:
"""Return iterable of node objects to start the queue with"""
raise NotImplementedError

def _get_node_id(self, node) -> str:
"""Return a unique ID string representing the current node; for queue management"""
raise NotImplementedError

def _get_node_children(self, node) -> Iterable:
"""Return a list of node objects that are children of the supplied node"""
raise NotImplementedError

def _get_node_terms(self, node) -> Iterable[str]:
"""Return a list of terms corresponding to the given node"""
raise NotImplementedError

def _gen_prompts(self, term: str) -> Iterable[str]:
"""Convert a term into a set of prompts"""
raise NotImplementedError

def _get_node_parent(self, node):
"""Return a node object's parent"""
raise NotImplementedError

def _get_node_siblings(self, node) -> Iterable:
"""Return sibling nodes, i.e. other children of parent"""
raise NotImplementedError

def probe(self, generator):

node_ids_explored = set()
nodes_to_explore = self._get_initial_nodes()
surface_forms_probed = set()

self.generator = generator
detector = garak._plugins.load_plugin(f"detectors.{self.primary_detector}")

all_completed_attempts: Iterable[garak.attempt.Attempt] = []

if not len(nodes_to_explore):
logging.info("No initial nodes for %s, skipping" % self.probename)
return []

tree_bar = tqdm.tqdm(
total=int(len(nodes_to_explore) * 4),
leave=False,
colour=f"#{garak.resources.theme.PROBE_RGB}",
)
tree_bar.set_description("Tree search nodes traversed")

while len(nodes_to_explore):

logging.debug(
"%s Queue: %s" % (self.__class__.__name__, repr(nodes_to_explore))
)
if self.strategy == "breadth_first":
current_node = nodes_to_explore.pop(0)
elif self.strategy == "depth_first":
current_node = nodes_to_explore.pop()

# update progress bar
progress_nodes_previous = len(node_ids_explored)
progress_nodes_todo = int(1 + len(nodes_to_explore) * 2.5)
# print("seen", node_ids_explored, progress_nodes_previous)
# print("curr", current_node)
# print("todo", nodes_to_explore, progress_nodes_todo)

tree_bar.total = progress_nodes_previous + progress_nodes_todo
tree_bar.refresh()

node_ids_explored.add(self._get_node_id(current_node))

# init this round's list of attempts
attempts_todo: Iterable[garak.attempt.Attempt] = []

logging.debug(
"%s %s, %s"
% (self.__class__.__name__, current_node, current_node.words())
)

# generate surface forms
new_surface_forms = list(self._get_node_terms(current_node))

# get prompts
for surface_form in new_surface_forms:
if (
surface_form in surface_forms_probed
or surface_form in self.never_queue_forms
):
continue

for prompt in self._gen_prompts(surface_form):
a = self._mint_attempt(prompt)
a.notes["surface_form"] = surface_form
attempts_todo.append(a)

surface_forms_probed.add(surface_form)

if len(attempts_todo) == 0:
tree_bar.update()
tree_bar.refresh()
continue

# buff hook
if len(_config.buffmanager.buffs) > 0:
attempts_todo = self._buff_hook(attempts_todo)

attempts_completed = self._execute_all(attempts_todo)

# now we call the detector 🙃
node_results = []
for attempt in attempts_completed:
attempt.detector_results[self.primary_detector] = detector.detect(
attempt
)
node_results += attempt.detector_results[self.primary_detector]
attempt.status = garak.attempt.ATTEMPT_COMPLETE
_config.transient.reportfile.write(json.dumps(attempt.as_dict()) + "\n")

tree_bar.update()
tree_bar.refresh()

all_completed_attempts += attempts_completed

node_results = [
1.0 if s > self.per_generation_threshold else 0 for s in node_results
]

mean_score = sum(node_results) / len(node_results)
parent = self._get_node_parent(current_node)
node_info = {
"entry_type": "tree_data",
"probe": self.__class__.__name__,
"detector": self.primary_detector,
"node_id": self._get_node_id(current_node),
"node_parent": (
self._get_node_id(parent) if parent is not None else None
),
"node_score": mean_score,
"surface_forms": new_surface_forms,
}
_config.transient.reportfile.write(json.dumps(node_info) + "\n")
logging.debug("%s node score %s" % (self.__class__.__name__, mean_score))

if (mean_score > self.per_node_threshold and self.target_soft) or (
mean_score < self.per_node_threshold and not self.target_soft
):
children = self._get_node_children(current_node)
logging.debug(
f"{self.__class__.__name__} adding children" + repr(children)
)
for child in children:
if (
self._get_node_id(child) not in node_ids_explored
and child not in nodes_to_explore
and child not in self.never_queue_nodes
):
logging.debug("%s %s" % (self.__class__.__name__, child))
nodes_to_explore.append(child)
else:
logging.debug(
"%s skipping %s" % (self.__class__.__name__, child)
)
else:
logging.debug("%s closing node" % self.__class__.__name__)

tree_bar.total = len(node_ids_explored)
tree_bar.update(len(node_ids_explored))
tree_bar.refresh()
tree_bar.close()

# we've done detection, so let's skip the main one
self.primary_detector_real = self.primary_detector
self.primary_detector = "always.Passthru"

return all_completed_attempts

def __init__(self, config_root=_config):
super().__init__(config_root)
if self.strategy not in ("breadth_first, depth_first"):
raise ValueError(f"Unsupported tree search strategy '{self.strategy}'")

self.never_queue_nodes: Iterable[str] = set()
self.never_queue_forms: Iterable[str] = set()
Loading
Loading