ServiceNow · han032206 · Aug 23, 2024 · Aug 27, 2024 · Oct 13, 2024 · Oct 25, 2024
diff --git a/Makefile b/Makefile
@@ -1,6 +1,6 @@
 install:
 	@echo "--- 🚀 Installing project dependencies ---"
-	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
+	pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/webcanvas -e ./browsergym/
 	playwright install chromium
 
 install-demo:

diff --git a/README.md b/README.md
@@ -41,6 +41,7 @@ BrowserGym includes the following benchmarks by default:
  - [WebArena](https://webarena.dev/)
  - [VisualWebArena](https://jykoh.com/vwa)
  - [WorkArena](https://github.com/ServiceNow/WorkArena)
+ - [WebCanvas](https://github.com/iMeanAI/WebCanvas)
  - [AssistantBench](https://github.com/oriyor/assistantbench)
  - [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
 
@@ -58,6 +59,7 @@ pip install browsergym-webarena  # core + webarena
 pip install browsergym-visualwebarena  # core + visualwebarena
 pip install browsergym-workarena  # core + workarena
 pip install browsergym-assistantbench  # core + assistantbench
+pip install browsergym-webcanvas  # core + webcanvas
 pip install weblinx-browsergym  # core + weblinx
 ```
 
@@ -71,6 +73,7 @@ Finally, each benchmark comes with its own specific setup that requires to follo
  - for WebArena, see [webarena/README.md](browsergym/webarena/README.md)
  - for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md)
  - for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
+ - for WebCanvas, see [webcanvas/README.md](browsergym/webcanvas/README.md) and the [WebCanvas](https://github.com/iMeanAI/WebCanvas) repository
  - for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
 
 ### 🏗️ Development setup
@@ -167,14 +170,28 @@ print("\n".join(env_ids))
 AssistantBench
 ```python
 import gymnasium as gym
-import browsergym.workarena  # register assistantbench tasks as gym environments
+import browsergym.assistantbench  # register assistantbench tasks as gym environments
 
 # start an assistantbench task
 env = gym.make("browsergym/assistantbench.validation.3")
 ...
 
 # list all the available assistantbench tasks
-env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/workarena")]
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/assistantbench")]
+print("\n".join(env_ids))
+```
+
+WebCanvas
+```python
+import gymnasium as gym
+import browsergym.webcanvas  # register webcanvas tasks as gym environments
+
+# start a webcanvas task
+env = gym.make("browsergym/webcanvas.mind2web-live.0")
+...
+
+# list all the available webcanvas tasks
+env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/webcanvas")]
 print("\n".join(env_ids))
 ```
 
@@ -211,6 +228,9 @@ python demo_agent/run_demo.py --task_name webarena.4
 
 # visualwebarena
 python demo_agent/run_demo.py --task_name visualwebarena.398
+
+# webcanvas
+python demo_agent/run_demo.py --task_name webcanvas.mind2web-live.0
 ```
 
 You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more!
@@ -228,6 +248,7 @@ python demo_agent/run_demo.py --help
 - [MiniWoB(++)](https://miniwob.farama.org/): A collection of over 100 web tasks on synthetic web pages.
 - [WebLINX](https://github.com/McGill-NLP/weblinx): A dataset of real-world web interaction traces.
 - [AssistantBench](https://github.com/oriyor/assistantbench): A benchmark of realistic and time-consuming tasks on the open web.
+- [WebCanvas](https://github.com/iMeanAI/WebCanvas): Benchmarks of web tasks on live websites, with key-node-based in-progress evaluation.
 
 ## 🌟 Contributors
 

diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py
@@ -9,6 +9,7 @@
 import gymnasium as gym
 import numpy as np
 import playwright.sync_api
+import json
 
 from . import _get_global_playwright
 from .action.base import execute_python_code
@@ -75,7 +76,8 @@ def __init__(
         pw_chromium_kwargs: dict = {},
         pw_context_kwargs: dict = {},
         # agent-related arguments
-        action_mapping: Optional[callable] = HighLevelActionSet().to_python_code,
+        action_mapping: Optional[callable] = HighLevelActionSet(
+        ).to_python_code,
     ):
         """
         Instantiate a ready to use BrowserEnv gym environment.
@@ -246,7 +248,8 @@ def override_property(task, env, property):
             no_viewport=True if self.resizeable_window else None,
             viewport=viewport if not self.resizeable_window else None,
             record_video_dir=(
-                Path(self.record_video_dir) / "task_video" if self.record_video_dir else None
+                Path(self.record_video_dir) /
+                "task_video" if self.record_video_dir else None
             ),
             record_video_size=viewport,
             locale=locale,
@@ -262,8 +265,10 @@ def override_property(task, env, property):
         # there is no concept of active page in playwright
         # https://github.com/microsoft/playwright/issues/2603
         self.context.expose_binding(
-            "browsergym_page_activated", lambda source: self._activate_page_from_js(source["page"])
+            "browsergym_page_activated", lambda source: self._activate_page_from_js(
+                source["page"])
         )
+
         self.context.add_init_script(
             r"""
 window.browsergym_page_activated();
@@ -406,15 +411,18 @@ def report_infeasible_instructions(reason: str):
             self.last_action_error = ""
         except Exception as e:
             self.last_action_error = f"{type(e).__name__}: {e}"
-            match = re.match("TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error)
+            match = re.match(
+                "TimeoutError: Timeout ([0-9]+)ms exceeded.", self.last_action_error)
             if match:
-                info["action_exec_timeout"] = float(match.groups()[0]) / 1000  # ms to sec
+                info["action_exec_timeout"] = float(
+                    match.groups()[0]) / 1000  # ms to sec
         logger.debug(f"Action executed")
         info["action_exec_stop"] = time.time()
 
         # wait a bit (for the JavaScript callback to set the active page)
         time.sleep(0.5)  # wait for JS events to be fired (half a second)
-        self.context.cookies()  # trigger all waiting Playwright callbacks on the stack (hack, see https://playwright.dev/java/docs/multithreading)
+        # trigger all waiting Playwright callbacks on the stack (hack, see https://playwright.dev/java/docs/multithreading)
+        self.context.cookies()
 
         # wait for the network to idle before extracting the observation, reward etc.
         self._wait_dom_loaded()
@@ -455,8 +463,8 @@ def _task_validate(self):
         prev_active_page = self.page
         prev_page_history = self.page_history.copy()
         # call validate
-        reward, done, user_message, info = self.task.validate(self.page, self.chat.messages)
-
+        reward, done, user_message, info = self.task.validate(
+            self.page, self.chat.messages, self.last_action)
         # safety fix, in case validate() did mess up the active page and/or page history
         if prev_active_page != self.page or prev_page_history != self.page_history:
             logger.debug(
@@ -498,7 +506,8 @@ def _activate_page_from_js(self, page: playwright.sync_api.Page):
                 page
             )  # move page to the end of dictionnary
         else:
-            self.page_history[page] = None  # add page to the end of dictionnary
+            # add page to the end of the dictionary
+            self.page_history[page] = None
 
         self.page = page
 
@@ -524,7 +533,8 @@ def _active_page_check(self):
 
         # active page should not be closed
         if self.page.is_closed():
-            raise RuntimeError(f"Unexpected: active page has been closed ({self.page}).")
+            raise RuntimeError(
+                f"Unexpected: active page has been closed ({self.page}).")
 
     def _get_obs(self):
 
@@ -583,4 +593,4 @@ def _get_obs(self):
             "elapsed_time": np.asarray([time.time() - self.start_time]),
         }
 
-        return obs
+        return obs
diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py
@@ -931,6 +931,8 @@ def _get_env_name(task_name: str):
         import browsergym.webarena
     elif task_name.startswith("visualwebarena"):
         import browsergym.visualwebarena
+    elif task_name.startswith("webcanvas"):
+        import browsergym.webcanvas
     elif task_name.startswith("assistantbench"):
         import browsergym.assistantbench
     elif task_name.startswith("weblinx"):

diff --git a/browsergym/webcanvas/README.md b/browsergym/webcanvas/README.md
@@ -0,0 +1,23 @@
+# WebCanvas Environment
+
+## Installation
+
+1. Install the package
+```sh
+pip install browsergym-webcanvas
+```
+2. Set up an OpenAI API key
+
+```sh
+export OPENAI_API_KEY=...
+```
+
+3. Download the dataset
+   - Option 1: Download from HuggingFace
+     Visit [Mind2Web-Live Dataset](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) and download the latest dataset.
+
+   - Option 2: Download from WebCanvas Platform
+     Visit [WebCanvas Repository](https://github.com/iMeanAI/WebCanvas) and follow the instructions to download the latest dataset.
+
+4. Place the dataset
+   - Put the downloaded JSON file into `./src/browsergym/webcanvas/data/`
diff --git a/browsergym/webcanvas/pyproject.toml b/browsergym/webcanvas/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = ["hatchling", "hatch-requirements-txt"]
+build-backend = "hatchling.build"
+
+[project]
+name = "browsergym-webcanvas"
+description = "WebCanvas benchmark for BrowserGym"
+authors = [
+    {name = "Sida Zhou"},
+    {name = "Dehan Kong"},
+]
+readme = "README.md"
+requires-python = ">3.7"
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "License :: OSI Approved :: Apache Software License",
+]
+dynamic = ["dependencies", "version"]
+
+[project.urls]
+homepage = "https://github.com/ServiceNow/BrowserGym"
+
+[tool.hatch.version]
+path = "../core/src/browsergym/core/__init__.py"
+
+[tool.hatch.metadata.hooks.requirements_txt]
+files = ["requirements.txt"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/browsergym"]
diff --git a/browsergym/webcanvas/requirements.txt b/browsergym/webcanvas/requirements.txt
@@ -0,0 +1,4 @@
+browsergym-core==0.13.3
+openai
+bs4
+lxml
diff --git a/browsergym/webcanvas/src/browsergym/webcanvas/__init__.py b/browsergym/webcanvas/src/browsergym/webcanvas/__init__.py
@@ -0,0 +1,15 @@
+from browsergym.core.registration import register_task
+
+# register the WebCanvas benchmark
+from . import config, task
+
+ALL_WEBCANVAS_TASK_IDS = []
+
+for task_id in config.TASK_TRAIN_IDS:
+    gym_id = f"webcanvas.mind2web-live.{task_id}"
+    register_task(
+        gym_id,
+        task.GenericWebCanvasTask,
+        kwargs={"task_kwargs": {"task_id": task_id}},
+    )
+    ALL_WEBCANVAS_TASK_IDS.append(gym_id)
diff --git a/browsergym/webcanvas/src/browsergym/webcanvas/config.py b/browsergym/webcanvas/src/browsergym/webcanvas/config.py
@@ -0,0 +1,2 @@
+TASK_TEST_IDS = range(104)
+TASK_TRAIN_IDS = range(130)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		TASK_TEST_IDS = range(104)
		TASK_TRAIN_IDS = range(130)