Benchmark updates #199

Merged · 3 commits · Oct 23, 2024
6 changes: 5 additions & 1 deletion browsergym/experiments/src/bgym/__init__.py
@@ -2,7 +2,11 @@
from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.core.action.python import PythonActionSet
from browsergym.experiments.agent import Agent, AgentInfo
from browsergym.experiments.benchmark import Benchmark, HighLevelActionSetArgs, BENCHMARKS
from browsergym.experiments.benchmark import (
DEFAULT_BENCHMARKS,
Benchmark,
HighLevelActionSetArgs,
)
from browsergym.experiments.loop import (
AbstractAgentArgs,
EnvArgs,
@@ -0,0 +1 @@
from .base import DEFAULT_BENCHMARKS, Benchmark, HighLevelActionSetArgs
@@ -1,7 +1,5 @@
import fnmatch
import io
import logging
import pkgutil
from dataclasses import dataclass, field
from typing import Literal, Optional

@@ -10,7 +8,13 @@
from dataclasses_json import DataClassJsonMixin, config

from browsergym.core.action.highlevel import HighLevelActionSet
from browsergym.experiments.loop import SEED_MAX, EnvArgs
from browsergym.experiments.loop import EnvArgs

from .metadata.utils import task_list_from_metadata, task_metadata
from .utils import (
make_env_args_list_from_repeat_tasks,
make_env_args_list_from_workarena_curriculum,
)

logger = logging.getLogger(__name__)

@@ -49,6 +53,7 @@ def make_action_set(self):
class Benchmark(DataClassJsonMixin):
name: str
high_level_action_set_args: HighLevelActionSetArgs
is_multi_tab: bool
env_args_list: list[EnvArgs]
task_metadata: Optional[pd.DataFrame] = field(
default_factory=lambda: None,
@@ -100,6 +105,7 @@ def subset_from_regexp(self, column, regexp):
return Benchmark(
name=f"{self.name}[{column}=/{regexp}/]",
high_level_action_set_args=self.high_level_action_set_args,
is_multi_tab=self.is_multi_tab,
env_args_list=[
env_args
for env_args in self.env_args_list
@@ -109,28 +115,6 @@ def subset_from_regexp(self, column, regexp):
)


def task_metadata(benchmark_name: str):
return task_metadata_from_csv(
io.StringIO(
pkgutil.get_data(__name__, f"task_metadata/{benchmark_name}.csv").decode("utf-8")
)
)


def task_metadata_from_csv(filepath):
return pd.read_csv(filepath).fillna("")


def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
df = metadata
# filter the desired columns (AND filter)
for col_name, regex in filter.items():
col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
df = df[col_filter]
# return only the task names
return list(df["task_name"])


# These are meant as the default high-level action sets to fairly evaluate agents on each benchmark.
# They are mostly arbitrary; the important thing is to evaluate different agents using the same action set for fairness.
DEFAULT_HIGHLEVEL_ACTION_SET_ARGS = {
@@ -166,7 +150,7 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
demo_mode="off",
),
"workarena": HighLevelActionSetArgs(
subsets=["chat", "infeas", "bid"],
subsets=["chat", "infeas", "bid", "tab", "nav"],
multiaction=False,
strict=False,
retry_with_force=False,
@@ -197,12 +181,13 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
),
}

# all benchmarks are callables designed for lazy loading, i.e. `bench = BENCHMARKS["miniwob_all"]()`
BENCHMARKS = {
# all benchmarks are callables designed for lazy loading, i.e. `bench = DEFAULT_BENCHMARKS["miniwob_all"]()`
DEFAULT_BENCHMARKS = {
"miniwob": lambda: Benchmark(
name="miniwob",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
is_multi_tab=False,
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("miniwob")),
max_steps=10,
n_repeats=5,
@@ -213,7 +198,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"miniwob_tiny_test": lambda: Benchmark(
name="miniwob_tiny_test",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["miniwob"],
env_args_list=_make_env_args_list_from_repeat_tasks(
is_multi_tab=False,
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
max_steps=5,
n_repeats=2,
@@ -224,7 +210,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"webarena": lambda: Benchmark(
name="webarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
env_args_list=_make_env_args_list_from_repeat_tasks(
is_multi_tab=True,
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("webarena")),
max_steps=15,
n_repeats=1,
@@ -235,7 +222,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"visualwebarena": lambda: Benchmark(
name="visualwebarena",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["visualwebarena"],
env_args_list=_make_env_args_list_from_repeat_tasks(
is_multi_tab=True,
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(metadata=task_metadata("visualwebarena")),
max_steps=15,
n_repeats=1,
@@ -246,7 +234,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"workarena_l1": lambda: Benchmark(
name="workarena_l1",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena_l1"],
env_args_list=_make_env_args_list_from_workarena_curriculum(
is_multi_tab=False,
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l1",
task_category_filter=None,
meta_seed=42, # meta seed for evaluation curriculum
@@ -259,7 +248,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"workarena_l2_agent_curriculum_eval": lambda: Benchmark(
name="workarena_l2_agent_curriculum_eval",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
env_args_list=_make_env_args_list_from_workarena_curriculum(
is_multi_tab=True,
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l2",
task_category_filter=None,
meta_seed=42, # meta seed for evaluation curriculum
@@ -271,7 +261,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"workarena_l3_agent_curriculum_eval": lambda: Benchmark(
name="workarena_l3_agent_curriculum_eval",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["workarena"],
env_args_list=_make_env_args_list_from_workarena_curriculum(
is_multi_tab=True,
env_args_list=make_env_args_list_from_workarena_curriculum(
level="l3",
task_category_filter=None,
meta_seed=42, # meta seed for evaluation curriculum
@@ -283,7 +274,8 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
"assistantbench": lambda: Benchmark(
name="assistantbench",
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["assistantbench"],
env_args_list=_make_env_args_list_from_repeat_tasks(
is_multi_tab=True,
env_args_list=make_env_args_list_from_repeat_tasks(
task_list=task_list_from_metadata(
metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"}
),
@@ -294,62 +286,3 @@ def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {})
task_metadata=task_metadata("assistantbench"),
),
}


def _make_env_args_list_from_workarena_curriculum(
level: Literal["l1", "l2", "l3"],
task_category_filter: str,
meta_seed: int,
max_steps: int,
curriculum_type: Literal["human", "agent"],
seeds_l1: int = 10,
):
"""
Returns a WorkArena predefined task curriculum (e.g., task and seed combination).
"""
assert level in ("l1", "l2", "l3")
assert curriculum_type in ("human", "agent")

env_args_list = []

from browsergym.workarena import get_all_tasks_agents

all_task_tuples = get_all_tasks_agents(
filter=f"{level}.{task_category_filter}" if task_category_filter else level,
meta_seed=meta_seed,
is_agent_curriculum=(curriculum_type == "agent"),
n_seed_l1=seeds_l1,
)

for task, seed in all_task_tuples:
task_name = task.get_task_id()
env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))

return env_args_list


def _make_env_args_list_from_repeat_tasks(
task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
):
"""
Generates a list of `len(task_list)` time `n_repeats` environments arguments, using randomly generated seeds.
"""
env_args_list = []
for task in task_list:
for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
env_args_list.append(
EnvArgs(
task_name=task,
task_seed=int(seed),
max_steps=max_steps,
headless=True,
record_video=False,
wait_for_user_message=False,
viewport=None,
slow_mo=None,
storage_state=None,
task_kwargs=None,
)
)

return env_args_list
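
The helpers removed from base.py above reappear, without the leading underscore, in the new benchmark utils module later in this PR, and the DEFAULT_BENCHMARKS entries now call those public versions. As the comment above DEFAULT_BENCHMARKS notes, each entry is a zero-argument callable, so a benchmark is only built on demand. A minimal usage sketch, not part of the diff, assuming the `bgym` re-exports added in the first file of this PR; the "task_name" column passed to `subset_from_regexp` is illustrative:

from bgym import DEFAULT_BENCHMARKS

# indexing returns a factory; calling it builds the Benchmark lazily
benchmark = DEFAULT_BENCHMARKS["miniwob"]()

print(benchmark.name)                # "miniwob"
print(benchmark.is_multi_tab)        # False; field introduced in this PR
print(len(benchmark.env_args_list))  # one EnvArgs per (task, seed) combination

# narrow the benchmark via a regexp filter on a task_metadata column
click_tasks = benchmark.subset_from_regexp(column="task_name", regexp="click")
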
@@ -0,0 +1,24 @@
import io
import pkgutil

import pandas as pd


def task_metadata(benchmark_name: str):
return task_metadata_from_csv(
io.StringIO(pkgutil.get_data(__name__, f"{benchmark_name}.csv").decode("utf-8"))
)


def task_metadata_from_csv(filepath):
return pd.read_csv(filepath).fillna("")


def task_list_from_metadata(metadata: pd.DataFrame, filter: dict[str, str] = {}):
df = metadata
# filter the desired columns (AND filter)
for col_name, regex in filter.items():
col_filter = df[col_name].astype(str).str.contains(regex, regex=True)
df = df[col_filter]
# return only the task names
return list(df["task_name"])
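
A short usage sketch of the relocated metadata helpers, not part of the diff. The module path follows the `from .metadata.utils import ...` relative import in base.py, and the regex filter mirrors the assistantbench entry above:

from browsergym.experiments.benchmark.metadata.utils import (
    task_list_from_metadata,
    task_metadata,
)

# load the packaged task metadata CSV for a benchmark
metadata = task_metadata("assistantbench")

# AND-filter metadata columns with regexes, then keep only the task names
tasks = task_list_from_metadata(metadata, filter={"browsergym_split": "valid|test"})
print(len(tasks), tasks[:3])
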
@@ -0,0 +1,95 @@
import io
import pkgutil
from typing import Literal

import numpy as np
import pandas as pd

from browsergym.experiments.loop import SEED_MAX, EnvArgs


def make_env_args_list_from_workarena_curriculum(
level: Literal["l1", "l2", "l3"],
task_category_filter: str,
meta_seed: int,
max_steps: int,
curriculum_type: Literal["human", "agent"],
seeds_l1: int = 10,
):
"""
Returns a WorkArena predefined task curriculum (i.e., a list of task and seed combinations).
"""
assert level in ("l1", "l2", "l3")
assert curriculum_type in ("human", "agent")

env_args_list = []

# dynamic import
from browsergym.workarena import get_all_tasks_agents

all_task_tuples = get_all_tasks_agents(
filter=f"{level}.{task_category_filter}" if task_category_filter else level,
meta_seed=meta_seed,
is_agent_curriculum=(curriculum_type == "agent"),
n_seed_l1=seeds_l1,
)

for task, seed in all_task_tuples:
task_name = task.get_task_id()
env_args_list.append(EnvArgs(task_name=task_name, task_seed=seed, max_steps=max_steps))

return env_args_list


def make_env_args_list_from_repeat_tasks(
task_list: list[str], max_steps: int, n_repeats: int, seeds_rng: np.random.RandomState
):
"""
Generates a list of `len(task_list)` times `n_repeats` environment arguments, using randomly generated seeds.
"""
env_args_list = []
for task in task_list:
for seed in seeds_rng.randint(low=0, high=SEED_MAX, size=n_repeats):
env_args_list.append(
EnvArgs(
task_name=task,
task_seed=int(seed),
max_steps=max_steps,
headless=True,
record_video=False,
wait_for_user_message=False,
viewport=None,
slow_mo=None,
storage_state=None,
task_kwargs=None,
)
)

return env_args_list


def make_env_args_list_from_fixed_seeds(
task_list: list[str], max_steps: int, fixed_seeds: list[int]
):
"""
Generates a list of `len(task_list)` times `len(fixed_seeds)` environment arguments, using the provided fixed seeds.
"""
env_args_list = []
for task in task_list:
for seed in fixed_seeds:
env_args_list.append(
EnvArgs(
task_name=task,
task_seed=int(seed),
max_steps=max_steps,
headless=True,
record_video=False,
wait_for_user_message=False,
viewport=None,
slow_mo=None,
storage_state=None,
task_kwargs=None,
)
)

return env_args_list
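
For completeness, a sketch of calling the new helpers directly, not part of the diff; the module path follows the `from .utils import ...` relative import in base.py, and the task names come from the miniwob_tiny_test entry above:

import numpy as np

from browsergym.experiments.benchmark.utils import (
    make_env_args_list_from_fixed_seeds,
    make_env_args_list_from_repeat_tasks,
)

# 2 tasks x 5 random seeds each -> 10 EnvArgs, seeds drawn from the provided RNG
env_args = make_env_args_list_from_repeat_tasks(
    task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
    max_steps=10,
    n_repeats=5,
    seeds_rng=np.random.RandomState(42),
)
assert len(env_args) == 10

# 2 tasks x 2 fixed seeds -> 4 EnvArgs, reproducible across runs
env_args_fixed = make_env_args_list_from_fixed_seeds(
    task_list=["miniwob.click-dialog", "miniwob.click-checkboxes"],
    max_steps=10,
    fixed_seeds=[0, 1],
)
assert len(env_args_fixed) == 4
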
2 changes: 1 addition & 1 deletion tests/assistantbench/test_evaluation.py
@@ -5,7 +5,7 @@
import pytest

from browsergym.assistantbench.evaluation.evaluator import question_scorer
from browsergym.experiments.benchmark import task_list_from_metadata, task_metadata
from browsergym.experiments.benchmark.base import task_list_from_metadata, task_metadata

__DATA_DIR = pathlib.Path(__file__).resolve().parent / "data"
