Skip to content

Commit

Permalink
Merge branch 'main' into workarena_action_set
Browse files Browse the repository at this point in the history
  • Loading branch information
ThibaultLSDC authored Oct 23, 2024
2 parents 7216796 + c7f77ba commit 73cdb19
Show file tree
Hide file tree
Showing 8 changed files with 39,236 additions and 2,571 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,21 @@ def __post_init__(self):
metadata_tasks = list(self.task_metadata["task_name"])
assert all([env_args.task_name in metadata_tasks for env_args in self.env_args_list])

def subset_from_split(self, split: Literal["train", "valid", "test"]):
    """Return the sub-benchmark made of the tasks in the default `split`.

    The split of each task is read from the "browsergym_split" column of
    `self.task_metadata`, and the selection itself is delegated to
    `self.subset_from_regexp` with an anchored pattern, so only an exact
    match of `split` is requested.

    Args:
        split: name of the default split to extract.

    Returns:
        Whatever `self.subset_from_regexp` returns for that split
        (presumably a benchmark restricted to the matching tasks).

    Raises:
        NotImplementedError: the task metadata has no "browsergym_split" column.
        ValueError: the requested split contains no task.
    """
    split_column = "browsergym_split"

    # check for a split column in metadata
    if split_column not in self.task_metadata.columns:
        raise NotImplementedError(
            f"This benchmark does not provide default train/valid/test splits (missing a {repr(split_column)} column in task metadata)"
        )

    # recover the target split (anchored so that e.g. "train" does not match a longer label)
    sub_benchmark = self.subset_from_regexp(split_column, regexp=f"^{split}$")

    # check that the split exists (non-empty task list)
    if not sub_benchmark.env_args_list:
        raise ValueError(f"The default {split} split for this benchmark is empty.")

    return sub_benchmark

Expand Down
16 changes: 15 additions & 1 deletion browsergym/experiments/src/browsergym/experiments/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ class ExpArgs:
stack_trace: str = None
order: int = None # use to keep the original order the experiments were meant to be launched.
logging_level: int = logging.INFO
logging_level_stdout: int = logging.INFO
exp_id: str = None
depends_on: tuple[str] = ()
save_screenshot: bool = True
Expand Down Expand Up @@ -289,12 +290,25 @@ def _set_logger(self):
# output logging traces to a log file
file_handler = logging.FileHandler(self.exp_dir / "experiment.log")
file_handler.setLevel(self.logging_level) # same level as console outputs
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
formatter = logging.Formatter(
"%(asctime)s - %(process)d - %(name)s - %(levelname)s - %(message)s"
)
file_handler.setFormatter(formatter)
# output handler
stream_handler = logging.StreamHandler()
stream_handler.setLevel(self.logging_level_stdout)
stream_handler.setFormatter(formatter)
# setup root logger
root_logger = logging.getLogger()

# remove previous stream handlers
for handler in root_logger.handlers:
if isinstance(handler, logging.StreamHandler):
root_logger.removeHandler(handler)

root_logger.setLevel(self.logging_level)
root_logger.addHandler(file_handler)
root_logger.addHandler(stream_handler)
# setup openai logger (don't go below INFO verbosity)
openai_logger = logging.getLogger("openai._base_client")
openai_logger.setLevel(max(logging.INFO, self.logging_level))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,126 +1,126 @@
task_name,miniwob_category,comment,webgum_subset
miniwob.ascending-numbers,hidden test,,False
miniwob.bisect-angle,original,,False
miniwob.book-flight,original,delay,True
miniwob.book-flight-nodelay,nodelay,,False
miniwob.buy-ticket,hidden test,,False
miniwob.choose-date,original,delay,True
miniwob.choose-date-easy,debug,delay,True
miniwob.choose-date-medium,debug,delay,True
miniwob.choose-date-nodelay,nodelay,,False
miniwob.choose-list,original,,True
miniwob.circle-center,original,,False
miniwob.click-button,original,,True
miniwob.click-button-sequence,original,,True
miniwob.click-checkboxes,original,,True
miniwob.click-checkboxes-large,additional,,True
miniwob.click-checkboxes-soft,additional,,True
miniwob.click-checkboxes-transfer,additional,,True
miniwob.click-collapsible,original,delay,True
miniwob.click-collapsible-2,original,delay,True
miniwob.click-collapsible-2-nodelay,nodelay,,False
miniwob.click-collapsible-nodelay,nodelay,,False
miniwob.click-color,original,,True
miniwob.click-dialog,original,,True
miniwob.click-dialog-2,original,,True
miniwob.click-link,original,,True
miniwob.click-menu,original,,True
miniwob.click-menu-2,original,,False
miniwob.click-option,original,,True
miniwob.click-pie,original,delay,True
miniwob.click-pie-nodelay,nodelay,,False
miniwob.click-scroll-list,original,,True
miniwob.click-shades,original,,True
miniwob.click-shape,original,,True
miniwob.click-tab,original,,True
miniwob.click-tab-2,original,,True
miniwob.click-tab-2-easy,debug,,False
miniwob.click-tab-2-hard,additional,,True
miniwob.click-tab-2-medium,debug,,False
miniwob.click-test,original,,True
miniwob.click-test-2,original,,True
miniwob.click-test-transfer,debug,,False
miniwob.click-widget,original,,True
miniwob.copy-paste,original,,False
miniwob.copy-paste-2,original,,False
miniwob.count-shape,original,,True
miniwob.count-sides,original,,False
miniwob.daily-calendar,hidden test,,False
miniwob.drag-box,original,,False
miniwob.drag-circle,original,,False
miniwob.drag-cube,original,,False
miniwob.drag-items,original,,False
miniwob.drag-items-grid,original,,False
miniwob.drag-shapes,original,,False
miniwob.drag-shapes-2,hidden test,,False
miniwob.drag-single-shape,hidden test,,False
miniwob.drag-sort-numbers,original,,False
miniwob.draw-circle,hidden test,,False
miniwob.draw-line,hidden test,,False
miniwob.email-inbox,original,,True
miniwob.email-inbox-delete,debug,,False
miniwob.email-inbox-forward,debug,,False
miniwob.email-inbox-forward-nl,additional,,True
miniwob.email-inbox-forward-nl-turk,additional,,True
miniwob.email-inbox-important,debug,,False
miniwob.email-inbox-nl-turk,additional,,True
miniwob.email-inbox-noscroll,debug,,False
miniwob.email-inbox-reply,debug,,False
miniwob.email-inbox-star-reply,debug,,False
miniwob.enter-date,original,,True
miniwob.enter-password,original,,True
miniwob.enter-text,original,,True
miniwob.enter-text-2,original,,False
miniwob.enter-text-dynamic,original,,True
miniwob.enter-time,original,,True
miniwob.find-greatest,hidden test,,False
miniwob.find-midpoint,original,,False
miniwob.find-word,original,,False
miniwob.focus-text,original,,True
miniwob.focus-text-2,original,,True
miniwob.form-sequence,hidden test,,False
miniwob.form-sequence-2,hidden test,,False
miniwob.form-sequence-3,hidden test,,False
miniwob.generate-number,hidden test,,False
miniwob.grid-coordinate,original,,True
miniwob.guess-number,original,,True
miniwob.highlight-text,original,,False
miniwob.highlight-text-2,original,,False
miniwob.hot-cold,hidden test,,False
miniwob.identify-shape,original,,True
miniwob.login-user,original,,True
miniwob.login-user-popup,additional,,True
miniwob.multi-layouts,additional,,True
miniwob.multi-orderings,additional,,True
miniwob.navigate-tree,original,,True
miniwob.number-checkboxes,original,,False
miniwob.odd-or-even,hidden test,,False
miniwob.order-food,hidden test,,False
miniwob.phone-book,hidden test,,False
miniwob.read-table,original,,False
miniwob.read-table-2,original,,False
miniwob.resize-textarea,original,,False
miniwob.right-angle,original,,False
miniwob.scroll-text,original,,False
miniwob.scroll-text-2,original,,False
miniwob.search-engine,original,,True
miniwob.sign-agreement,hidden test,,False
miniwob.simple-algebra,original,,False
miniwob.simple-arithmetic,original,,False
miniwob.social-media,original,,True
miniwob.social-media-all,additional,,True
miniwob.social-media-some,additional,,True
miniwob.stock-market,hidden test,delay,False
miniwob.terminal,original,,False
miniwob.text-editor,original,,False
miniwob.text-transform,original,,False
miniwob.tic-tac-toe,original,,True
miniwob.unicode-test,debug,,False
miniwob.use-autocomplete,original,delay,True
miniwob.use-autocomplete-nodelay,nodelay,,False
miniwob.use-colorwheel,original,,False
miniwob.use-colorwheel-2,original,,False
miniwob.use-slider,original,,False
miniwob.use-slider-2,original,,False
miniwob.use-spinner,original,,True
miniwob.visual-addition,original,,False
task_name,miniwob_category,comment,webgum_subset,similarity_group,browsergym_split
miniwob.ascending-numbers,hidden test,,False,0,test
miniwob.bisect-angle,original,,False,1,test
miniwob.book-flight,original,delay,True,2,train
miniwob.book-flight-nodelay,nodelay,,False,2,train
miniwob.buy-ticket,hidden test,,False,3,train
miniwob.choose-date,original,delay,True,4,train
miniwob.choose-date-easy,debug,delay,True,4,train
miniwob.choose-date-medium,debug,delay,True,4,train
miniwob.choose-date-nodelay,nodelay,,False,4,train
miniwob.choose-list,original,,True,5,test
miniwob.circle-center,original,,False,6,train
miniwob.click-button,original,,True,7,test
miniwob.click-button-sequence,original,,True,8,test
miniwob.click-checkboxes,original,,True,9,train
miniwob.click-checkboxes-large,additional,,True,9,train
miniwob.click-checkboxes-soft,additional,,True,9,train
miniwob.click-checkboxes-transfer,additional,,True,9,train
miniwob.click-collapsible,original,delay,True,10,train
miniwob.click-collapsible-2,original,delay,True,10,train
miniwob.click-collapsible-2-nodelay,nodelay,,False,10,train
miniwob.click-collapsible-nodelay,nodelay,,False,10,train
miniwob.click-color,original,,True,11,test
miniwob.click-dialog,original,,True,12,train
miniwob.click-dialog-2,original,,True,12,train
miniwob.click-link,original,,True,13,test
miniwob.click-menu,original,,True,14,test
miniwob.click-menu-2,original,,False,14,test
miniwob.click-option,original,,True,15,train
miniwob.click-pie,original,delay,True,16,test
miniwob.click-pie-nodelay,nodelay,,False,16,test
miniwob.click-scroll-list,original,,True,17,test
miniwob.click-shades,original,,True,18,train
miniwob.click-shape,original,,True,19,test
miniwob.click-tab,original,,True,20,test
miniwob.click-tab-2,original,,True,21,test
miniwob.click-tab-2-easy,debug,,False,21,test
miniwob.click-tab-2-hard,additional,,True,21,test
miniwob.click-tab-2-medium,debug,,False,21,test
miniwob.click-test,original,,True,22,test
miniwob.click-test-2,original,,True,23,train
miniwob.click-test-transfer,debug,,False,23,train
miniwob.click-widget,original,,True,24,test
miniwob.copy-paste,original,,False,25,test
miniwob.copy-paste-2,original,,False,25,test
miniwob.count-shape,original,,True,26,train
miniwob.count-sides,original,,False,27,test
miniwob.daily-calendar,hidden test,,False,28,train
miniwob.drag-box,original,,False,29,train
miniwob.drag-circle,original,,False,30,test
miniwob.drag-cube,original,,False,31,test
miniwob.drag-items,original,,False,32,train
miniwob.drag-items-grid,original,,False,33,test
miniwob.drag-shapes,original,,False,29,train
miniwob.drag-shapes-2,hidden test,,False,29,train
miniwob.drag-single-shape,hidden test,,False,34,train
miniwob.drag-sort-numbers,original,,False,32,train
miniwob.draw-circle,hidden test,,False,35,train
miniwob.draw-line,hidden test,,False,36,train
miniwob.email-inbox,original,,True,37,test
miniwob.email-inbox-delete,debug,,False,37,test
miniwob.email-inbox-forward,debug,,False,37,test
miniwob.email-inbox-forward-nl,additional,,True,37,test
miniwob.email-inbox-forward-nl-turk,additional,,True,37,test
miniwob.email-inbox-important,debug,,False,37,test
miniwob.email-inbox-nl-turk,additional,,True,37,test
miniwob.email-inbox-noscroll,debug,,False,37,test
miniwob.email-inbox-reply,debug,,False,37,test
miniwob.email-inbox-star-reply,debug,,False,37,test
miniwob.enter-date,original,,True,38,train
miniwob.enter-password,original,,True,38,train
miniwob.enter-text,original,,True,38,train
miniwob.enter-text-2,original,,False,38,train
miniwob.enter-text-dynamic,original,,True,38,train
miniwob.enter-time,original,,True,38,train
miniwob.find-greatest,hidden test,,False,39,test
miniwob.find-midpoint,original,,False,40,train
miniwob.find-word,original,,False,41,test
miniwob.focus-text,original,,True,42,train
miniwob.focus-text-2,original,,True,42,train
miniwob.form-sequence,hidden test,,False,43,train
miniwob.form-sequence-2,hidden test,,False,44,test
miniwob.form-sequence-3,hidden test,,False,45,test
miniwob.generate-number,hidden test,,False,46,test
miniwob.grid-coordinate,original,,True,47,test
miniwob.guess-number,original,,True,48,test
miniwob.highlight-text,original,,False,49,test
miniwob.highlight-text-2,original,,False,49,test
miniwob.hot-cold,hidden test,,False,50,test
miniwob.identify-shape,original,,True,51,train
miniwob.login-user,original,,True,52,train
miniwob.login-user-popup,additional,,True,52,train
miniwob.multi-layouts,additional,,True,53,train
miniwob.multi-orderings,additional,,True,53,train
miniwob.navigate-tree,original,,True,54,train
miniwob.number-checkboxes,original,,False,55,test
miniwob.odd-or-even,hidden test,,False,56,train
miniwob.order-food,hidden test,,False,57,test
miniwob.phone-book,hidden test,,False,58,test
miniwob.read-table,original,,False,59,train
miniwob.read-table-2,original,,False,59,train
miniwob.resize-textarea,original,,False,60,test
miniwob.right-angle,original,,False,61,test
miniwob.scroll-text,original,,False,62,test
miniwob.scroll-text-2,original,,False,62,test
miniwob.search-engine,original,,True,63,train
miniwob.sign-agreement,hidden test,,False,64,test
miniwob.simple-algebra,original,,False,65,test
miniwob.simple-arithmetic,original,,False,65,test
miniwob.social-media,original,,True,66,test
miniwob.social-media-all,additional,,True,66,test
miniwob.social-media-some,additional,,True,66,test
miniwob.stock-market,hidden test,delay,False,67,test
miniwob.terminal,original,,False,68,test
miniwob.text-editor,original,,False,69,train
miniwob.text-transform,original,,False,70,train
miniwob.tic-tac-toe,original,,True,71,test
miniwob.unicode-test,debug,,False,72,train
miniwob.use-autocomplete,original,delay,True,73,train
miniwob.use-autocomplete-nodelay,nodelay,,False,73,train
miniwob.use-colorwheel,original,,False,74,train
miniwob.use-colorwheel-2,original,,False,74,train
miniwob.use-slider,original,,False,75,train
miniwob.use-slider-2,original,,False,75,train
miniwob.use-spinner,original,,True,76,test
miniwob.visual-addition,original,,False,77,train
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import importlib.resources
import json

import numpy as np

from browsergym.experiments.benchmark import task_metadata

# for posterity


def print_metadata_workarena():
from browsergym.workarena import (
Expand Down Expand Up @@ -87,3 +93,69 @@ def print_metadata_visualwebarena():
)

print("\n".join([",".join(x) for x in metadata]))


def print_miniwob_train_test_splits():
    """Print a train/test split of the miniwob tasks, grouped by similarity.

    Loads the "miniwob" task metadata and assigns each `similarity_group` as a
    whole to either train or test, so tasks of one group never straddle both
    splits. Groups are drawn with a fixed-seed RNG, with probability
    proportional to their task count. Once every group is assigned, groups are
    moved between the two sides until their task counts differ by at most
    `slack` (the parity of the total task count).

    Progress is printed at each step, followed by the metadata as CSV with a
    "browsergym_split" column holding "train", "test" or "" per task.
    """
    metadata = task_metadata("miniwob")

    # number of tasks per similarity group, keyed by group id
    counts = metadata["similarity_group"].value_counts(sort=False)
    group_counts = dict(zip(counts.index, counts))

    free_groups = set(group_counts.keys())
    train_groups = set()
    test_groups = set()
    rng = np.random.RandomState(1337)

    # slack for train / test size equality
    slack = sum(group_counts.values()) % 2

    def transfer_random_group(source: set, target: set):
        # sampling a group proportionally to its size amounts to picking
        # uniformly among the tasks held by `source`
        candidates = list(source)
        weights = np.asarray([float(group_counts[candidate]) for candidate in candidates])
        weights = weights / weights.sum()
        (picked,) = rng.choice(candidates, size=1, p=weights)
        # move between sets
        target.add(picked)
        source.remove(picked)
        # hand the group back so the caller can report it
        return picked

    balanced = False
    while not balanced:
        n_train = sum(group_counts[group] for group in train_groups)
        n_test = sum(group_counts[group] for group in test_groups)

        print(f"train/test split: {n_train} <> {n_test}")

        if not free_groups:
            # group switching phase: every group is assigned, rebalance the sides
            if n_train < n_test - slack:
                group = transfer_random_group(test_groups, train_groups)
                print(f"switching {group} from test to train")
            elif n_test < n_train - slack:
                group = transfer_random_group(train_groups, test_groups)
                print(f"switching {group} from train to test")
            else:
                # done (equilibrium)
                print("equilibrium")
                balanced = True
        elif n_train < n_test:
            # growing phase: feed the strictly smaller side (ties go to test)
            group = transfer_random_group(free_groups, train_groups)
            print(f"adding {group} to train")
        else:
            group = transfer_random_group(free_groups, test_groups)
            print(f"adding {group} to test")

        print()

    def split_label(group):
        # groups left in neither set get an empty label
        if group in train_groups:
            return "train"
        if group in test_groups:
            return "test"
        return ""

    metadata["browsergym_split"] = metadata["similarity_group"].apply(split_label)

    print(metadata.to_csv(index=False))
Loading

0 comments on commit 73cdb19

Please sign in to comment.