ServiceNow · gasse · Oct 8, 2024 · Aug 9, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/README.md b/README.md
@@ -158,24 +158,44 @@ print("\n".join(env_ids))
 
 ## Demo
 
-If you want to experiment with an agent in BrowserGym, follow these steps:
+If you want to experiment with a demo agent in BrowserGym, follow these steps:
 
 ```sh
 cd demo-agent
-conda env create -f environment.yml; conda activate demo-agent
+conda env create -f environment.yml
+conda activate demo-agent
 # or simply use `pip install -r requirements.txt`
 playwright install chromium
 ```
 
-Optional: Set your `OPENAI_API_KEY` to use a GPT agent.
-
-Launch the demo on the open web:
+Our demo agent uses `openai` as a backend, so be sure to set your `OPENAI_API_KEY`.
 
+Launch the demo agent on the open web:
 ```sh
 python run_demo.py --task_name openended --start_url https://www.google.com
 ```
 
-You can customize your experience by changing the `model_name` to your preferred LLM, toggling Chain-of-thought with `use_thinking`, adding screenshots for your VLMs with `use_screenshot`, and much more!
+Or use it to solve a simple MiniWoB task:
+```sh
+python run_demo.py --task_name miniwob.click-test
+```
+
+A VisualWebArena task:
+```sh
+python run_demo.py --task_name visualwebarena.398
+```
+
+A WebArena task:
+```sh
+python run_demo.py --task_name webarena.4
+```
+
+A WorkArena task:
+```sh
+python run_demo.py --task_name workarena.servicenow.order-standard-laptop
+```
+
+You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more (see `python run_demo.py --help`)!
 
 
 ## Citing This Work

diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py
@@ -1,38 +1,54 @@
 import copy
-import gymnasium as gym
 import logging
-import numpy as np
-import playwright.sync_api
-import time
 import re
-
+import time
 from abc import ABC
 from pathlib import Path
-from typing import Optional, Literal
+from typing import Literal, Optional
 
+import gymnasium as gym
+import numpy as np
+import playwright.sync_api
+
+from . import _get_global_playwright
+from .action.base import execute_python_code
+from .action.highlevel import HighLevelActionSet
 from .chat import Chat
-from .task import AbstractBrowserTask
-from .spaces import Unicode, AnyDict, AnyBox
-from .constants import TEXT_MAX_LENGTH, BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES
+from .constants import BROWSERGYM_ID_ATTRIBUTE, EXTRACT_OBS_MAX_TRIES, TEXT_MAX_LENGTH
 from .observation import (
-    _pre_extract,
+    MarkingError,
     _post_extract,
-    extract_screenshot,
-    extract_dom_snapshot,
+    _pre_extract,
     extract_dom_extra_properties,
-    extract_merged_axtree,
+    extract_dom_snapshot,
     extract_focused_element_bid,
-    MarkingError,
+    extract_merged_axtree,
+    extract_screenshot,
 )
-from .action.base import execute_python_code
-from .action.highlevel import HighLevelActionSet
-from .action.base import execute_python_code
-from . import _get_global_playwright
-
+from .spaces import AnyBox, AnyDict, Unicode
+from .task import AbstractBrowserTask
 
 logger = logging.getLogger(__name__)
 
 
+def _try_to_extract_legacy_goal(goal: list):
+    legacy_goal_strings = []
+    for message in goal:
+        if message["type"] == "text":
+            legacy_goal_strings.append(message["text"])
+        else:
+            logger.debug(
+                f"Message type {repr(message['type'])} present in the goal, cannot be converted to legacy text-only format."
+            )
+            legacy_goal_strings.append(
+                'WARNING: This goal cannot be converted to a text-only goal format. Use the new goal format instead ("goal_object" field). Any agent reading this should abort immediately.'
+            )
+            break
+    legacy_goal = "\n".join(legacy_goal_strings)
+
+    return legacy_goal
+
+
 class BrowserEnv(gym.Env, ABC):
     """The main BrowserGym class, which encapsulates instruction-following Web browsing into a Gymnasium environment."""
 
@@ -120,11 +136,8 @@ def __init__(
                         }
                     )
                 ),
-                # TODO: this is redundant with chat messages, to be removed
                 "goal": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
-                "goal_image_urls": gym.spaces.Sequence(
-                    Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
-                ),
+                "goal_object": gym.spaces.Sequence(AnyDict()),
                 "open_pages_urls": gym.spaces.Sequence(
                     Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
                 ),
@@ -266,27 +279,42 @@ def override_property(task, env, property):
         recording_start_time = time.time()
 
         # setup the task
-        goal, task_info = self.task.setup(page=self.page)
+        task_goal, task_info = self.task.setup(page=self.page)
+
+        # process the task goal
+
+        # no goal specified
+        if task_goal is None:
+            self.goal_object = []
+        # convert text-only goal (legacy) to new format
+        elif isinstance(task_goal, str):
+            self.goal_object = [{"type": "text", "text": task_goal}]
+        # new format goal with multiple texts and images (OpenAI style)
+        elif isinstance(task_goal, list):
+            self.goal_object = task_goal
+        else:
+            raise ValueError(f"task_goal should be of type str or list, got {task_goal.__class__}")
 
         # initialize the chat
         self.chat.add_message(
             role="assistant",
             msg="Hi! I am your UI assistant, I can perform web tasks for you. What can I help you with?",
         )
-        # if any, add the task's goal to the chat
-        if goal:
-
-            # goal is text-only
-            if isinstance(goal, str):
-                goal_msg = goal
 
-            # goal is text + images
-            elif isinstance(goal, dict):
-                goal_msg = goal["message"]
-                for image_url in goal["image_urls"]:
-                    self.chat.add_message(role="user_image", msg=image_url)
-
-            self.chat.add_message(role="user", msg=goal_msg)
+        # send task goal (if any) to the chat
+        for message in self.goal_object:
+            match message["type"]:
+                case "text":
+                    self.chat.add_message(role="user", msg=message["text"])
+                case "image_url":
+                    image_src = message["image_url"]
+                    if isinstance(image_src, dict):
+                        image_src = image_src["url"]
+                    self.chat.add_message(role="user_image", msg=image_src)
+                case _:
+                    raise ValueError(
+                        f"Unknown message type {repr(message['type'])} in the task goal."
+                    )
 
         self._wait_dom_loaded()
 
@@ -508,26 +536,11 @@ def _get_obs(self):
         # post-extraction cleanup of temporary info in dom
         _post_extract(self.page)
 
-        # use first user message as goal, if any
-        # use all user images before first user message as goal images, if any
-        goal_msg = "There is no goal."
-        goal_image_urls = []
-        _prev_image_urls = []
-        for msg in self.chat.messages:
-            if msg["role"] == "user_image":
-                _prev_image_urls.append(msg["message"])
-            elif msg["role"] == "user":
-                goal_msg = msg["message"]
-                goal_image_urls = _prev_image_urls
-                break
-            else:
-                pass
-
         # obs is generic to all tasks
         obs = {
             "chat_messages": copy.deepcopy(self.chat.messages),
-            "goal": goal_msg,  # TODO: redundant with chat messages, to be removed?
-            "goal_image_urls": goal_image_urls,  # TODO: redundant with chat messages, to be removed?
+            "goal": _try_to_extract_legacy_goal(self.goal_object),  # legacy goal, deprecated
+            "goal_object": self.goal_object,  # new goal format, list of messages openai style
             "open_pages_urls": [page.url for page in self.context.pages],
             "active_page_index": np.asarray([self.context.pages.index(self.page)]),
             "url": self.page.url,

diff --git a/browsergym/core/src/browsergym/core/registration.py b/browsergym/core/src/browsergym/core/registration.py
@@ -6,7 +6,12 @@
 
 
 def register_task(
-    id: str, task_class: Type[AbstractBrowserTask], nondeterministic: bool = True, *args, **kwargs
+    id: str,
+    task_class: Type[AbstractBrowserTask],
+    task_kwargs: dict = None,
+    nondeterministic: bool = True,
+    *args,
+    **kwargs,
 ):
     """
     Registers a browser task as a gym environment with its unique id.
@@ -19,9 +24,16 @@ def register_task(
         *kwargs: additional arguments for the browsergym environment.
     """
 
+    # these environment arguments are fixed at registration; an error will be raised if they are also passed to gym.make()
+    fixed_env_kwargs = {}
+    if task_kwargs is not None:
+        fixed_env_kwargs["task_kwargs"] = task_kwargs
+
     gym.register(
         id=f"browsergym/{id}",
-        entry_point=lambda *env_args, **env_kwargs: BrowserEnv(task_class, *env_args, **env_kwargs),
+        entry_point=lambda *env_args, **env_kwargs: BrowserEnv(
+            task_class, *env_args, **fixed_env_kwargs, **env_kwargs
+        ),
         nondeterministic=nondeterministic,
         *args,
         **kwargs,

diff --git a/browsergym/core/src/browsergym/core/spaces.py b/browsergym/core/src/browsergym/core/spaces.py
@@ -79,6 +79,19 @@ def __eq__(self, other: Any) -> bool:
         return isinstance(other, AnyDict)
 
 
+class Anything(Space):
+    """A space representing an arbitrary object (its `contains` accepts any value)."""
+
+    def contains(self, x: Any) -> bool:
+        return True
+
+    def __repr__(self) -> str:
+        return f"Anything()"
+
+    def __eq__(self, other: Any) -> bool:
+        return isinstance(other, Anything)
+
+
 class AnyBox(Space[NDArray[Any]]):
     """A space representing an arbitrary dictionary object."""
 

diff --git a/browsergym/core/src/browsergym/core/task.py b/browsergym/core/src/browsergym/core/task.py
@@ -1,9 +1,9 @@
-import numpy as np
-import playwright.sync_api
-
 from abc import ABC, abstractmethod
 from typing import Tuple
 
+import numpy as np
+import playwright.sync_api
+
 
 class AbstractBrowserTask(ABC):
     """

diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py
@@ -432,7 +432,18 @@ def save_step_info(self, exp_dir, save_json=False, save_screenshot=True, save_so
             img = Image.fromarray(screenshot_som)
             img.save(exp_dir / f"screenshot_som_step_{self.step}.png")
 
+        # save goal object (which might contain images) to a separate file to save space
+        if self.obs is not None and self.obs.get("goal_object", False):
+            # save the goal object only once (goal should never change once setup)
+            goal_object_file = Path(exp_dir) / "goal_object.pkl.gz"
+            if not goal_object_file.exists():
+                with gzip.open(goal_object_file, "wb") as f:
+                    pickle.dump(self.obs["goal_object"], f)
+            # set goal_object to a special placeholder value, which indicates it should be loaded from a separate file
+            self.obs["goal_object"] = None
+
         with gzip.open(exp_dir / f"step_{self.step}.pkl.gz", "wb") as f:
+            # TODO should we pop the screenshots too before this to save space ?
             pickle.dump(self, f)
 
         if save_json:
@@ -584,6 +595,16 @@ def get_step_info(self, step: int) -> StepInfo:
                     )
                 except FileNotFoundError:
                     pass
+        # if goal_object is set to None, it indicates it has been saved into a separate file
+        if (
+            self._steps_info[step].obs
+            and "goal_object" in self._steps_info[step].obs
+            and self._steps_info[step].obs["goal_object"] is None
+        ):
+            with gzip.open(self.exp_dir / "goal_object.pkl.gz", "rb") as f:
+                goal_object = pickle.load(f)
+                self._steps_info[step].obs["goal_object"] = goal_object
+
         return self._steps_info[step]
 
     @property

diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/__init__.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/__init__.py
@@ -13,7 +13,7 @@
     register_task(
         gym_id,
         task.GenericVisualWebArenaTask,
-        kwargs={"task_kwargs": {"task_id": task_id}},
+        task_kwargs={"task_id": task_id},
     )
     ALL_VISUALWEBARENA_TASK_IDS.append(gym_id)
     if task_id in config.TASK_IDS_WITH_RESET: