From 0191bfcdab84aee40d9fd7096df053c9d0478085 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Tue, 15 Oct 2024 16:00:56 -0400
Subject: [PATCH 01/10] new obs["open_pages_titles"]

---
 browsergym/core/src/browsergym/core/env.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/browsergym/core/src/browsergym/core/env.py b/browsergym/core/src/browsergym/core/env.py
index 22fffaaa..677bb19c 100644
--- a/browsergym/core/src/browsergym/core/env.py
+++ b/browsergym/core/src/browsergym/core/env.py
@@ -141,6 +141,9 @@ def __init__(
                 "open_pages_urls": gym.spaces.Sequence(
                     Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
                 ),
+                "open_pages_titles": gym.spaces.Sequence(
+                    Unicode(min_length=0, max_length=TEXT_MAX_LENGTH)
+                ),
                 "active_page_index": gym.spaces.Box(low=0, high=255, dtype=int),
                 "url": Unicode(min_length=0, max_length=TEXT_MAX_LENGTH),
                 "screenshot": AnyBox(
@@ -542,8 +545,9 @@ def _get_obs(self):
             "goal": _try_to_extract_legacy_goal(self.goal_object),  # legacy goal, deprecated
             "goal_object": self.goal_object,  # new goal format, list of messages openai style
             "open_pages_urls": [page.url for page in self.context.pages],
+            "open_pages_titles": [page.title() for page in self.context.pages],
             "active_page_index": np.asarray([self.context.pages.index(self.page)]),
-            "url": self.page.url,
+            "url": self.page.url,  # redundant with "open_pages_urls" and "active_page_index"
             "screenshot": extract_screenshot(self.page),
             "dom_object": dom,
             "axtree_object": axtree,

From 34a218229a448e2a6d46cc93039dc38e36ca4234 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 16:41:59 -0400
Subject: [PATCH 02/10] wa / vwa url safeguard

---
 .../src/browsergym/visualwebarena/task.py     | 21 +++++++++++++-----
 .../webarena/src/browsergym/webarena/task.py  | 22 ++++++++++++++-----
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
index 00a3107e..3e5290bb 100644
--- a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
+++ b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
@@ -1,13 +1,14 @@
+import importlib.resources
 import json
 import logging
-import playwright.sync_api
-import importlib.resources
 import pathlib
 import tempfile
-import requests
-
+import urllib.parse
 from typing import Optional, Tuple
 
+import playwright.sync_api
+import requests
+
 from browsergym.core.task import AbstractBrowserTask
 
 from .instance import VisualWebArenaInstance
@@ -226,7 +227,17 @@ def teardown(self) -> None:
     def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
-        # import webarena on instanciation
+        # check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = [
+            urllib.parse.urlparse(url).netloc
+            for url in [*self.webarena_instance.urls, self.webarena_instance.home_url]
+        ]
+        for open_page in page.context.pages:
+            page_location = urllib.parse.urlparse(open_page.url).netloc
+            if not page_location in authorized_locations:
+                return 0, True, "", {"error": "Unauthorized url, terminating task"}
+
+        # import webarena dynamically
         from visualwebarena.browser_env.actions import ActionTypes
 
         # if any, use the last assistant message as the stop answer for webarena
diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py
index f1d5a59d..a7e3fe98 100644
--- a/browsergym/webarena/src/browsergym/webarena/task.py
+++ b/browsergym/webarena/src/browsergym/webarena/task.py
@@ -1,12 +1,13 @@
+import importlib.resources
 import json
 import logging
-import numpy as np
-import playwright.sync_api
-import importlib.resources
 import tempfile
-
+import urllib.parse
 from typing import Optional, Tuple
 
+import numpy as np
+import playwright.sync_api
+
 from browsergym.core.task import AbstractBrowserTask
 
 from .instance import WebArenaInstance
@@ -154,7 +155,18 @@ def teardown(self) -> None:
     def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
-        # import webarena on instanciation
+
+        # check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = [
+            urllib.parse.urlparse(url).netloc
+            for url in [*self.webarena_instance.urls, self.webarena_instance.home_url]
+        ]
+        for open_page in page.context.pages:
+            page_location = urllib.parse.urlparse(open_page.url).netloc
+            if not page_location in authorized_locations:
+                return 0, True, "", {"error": "Unauthorized url, terminating task"}
+
+        # import webarena dynamically
         from webarena.browser_env.actions import ActionTypes
 
         # if any, use the last assistant message as the stop answer for webarena

From 6375634ebe4b8a6209c9bbb6cea1d09c180a1553 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 16:42:14 -0400
Subject: [PATCH 03/10] basic_agent multi-tab update

---
 demo_agent/basic_agent.py | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/demo_agent/basic_agent.py b/demo_agent/basic_agent.py
index 59ad59e8..e6515c4d 100644
--- a/demo_agent/basic_agent.py
+++ b/demo_agent/basic_agent.py
@@ -1,14 +1,14 @@
 import base64
 import dataclasses
-import numpy as np
 import io
 import logging
 
+import numpy as np
 from PIL import Image
 
-from browsergym.experiments import Agent, AbstractAgentArgs
 from browsergym.core.action.highlevel import HighLevelActionSet
 from browsergym.core.action.python import PythonActionSet
+from browsergym.experiments import AbstractAgentArgs, Agent
 from browsergym.utils.obs import flatten_axtree_to_str, flatten_dom_to_str, prune_html
 
 logger = logging.getLogger(__name__)
@@ -40,6 +40,9 @@ def obs_preprocessor(self, obs: dict) -> dict:
             "goal_object": obs["goal_object"],
             "last_action": obs["last_action"],
             "last_action_error": obs["last_action_error"],
+            "open_pages_urls": obs["open_pages_urls"],
+            "open_pages_titles": obs["open_pages_titles"],
+            "active_page_index": obs["active_page_index"],
             "axtree_txt": flatten_axtree_to_str(obs["axtree_object"]),
             "pruned_html": prune_html(flatten_dom_to_str(obs["dom_object"])),
         }
@@ -68,7 +71,7 @@ def __init__(
         self.openai_client = OpenAI()
 
         self.action_set = HighLevelActionSet(
-            subsets=["chat", "bid", "infeas"],  # define a subset of the action space
+            subsets=["chat", "tab", "nav", "bid", "infeas"],  # define a subset of the action space
             # subsets=["chat", "bid", "coord", "infeas"] # allow the agent to also use x,y coordinates
             strict=False,  # less strict on the parsing of the actions
             multiaction=False,  # does not enable the agent to take multiple actions at once
@@ -151,6 +154,29 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
             # goal_object is directly presented as a list of openai-style messages
             user_msgs.extend(obs["goal_object"])
 
+        # append url of all open tabs
+        user_msgs.append(
+            {
+                "type": "text",
+                "text": f"""\
+# Currently open tabs
+""",
+            }
+        )
+        for page_index, (page_url, page_title) in enumerate(
+            zip(obs["open_pages_urls"], obs["open_pages_titles"])
+        ):
+            user_msgs.append(
+                {
+                    "type": "text",
+                    "text": f"""\
+Tab {page_index}{" (active tab)" if page_index == obs["active_page_index"] else ""}
+  Title: {page_title}
+  URL: {page_url}
+""",
+                }
+            )
+
         # append page AXTree (if asked)
         if self.use_axtree:
             user_msgs.append(
@@ -234,6 +260,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
                     {
                         "type": "text",
                         "text": f"""\
+
 {action}
 """,
                     }
@@ -261,7 +288,7 @@ def get_action(self, obs: dict) -> tuple[str, dict]:
                 "text": f"""\
 # Next action
 
-You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, the current state of the page before deciding on your next action.
+You will now think step by step and produce your next best action. Reflect on your past actions, any resulting error message, and the current state of the page before deciding on your next action.
 """,
             }
         )

From 9e8943afaf316a8fb5b8eb314ccccadde7d2f076 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:28:00 -0400
Subject: [PATCH 04/10] ci tests + fixes

---
 .../src/browsergym/core/action/functions.py   |  33 +-
 .../src/browsergym/visualwebarena/task.py     |   7 +-
 .../webarena/src/browsergym/webarena/task.py  |   6 +-
 tests/core/test_actions_highlevel.py          | 341 +++---------------
 .../test_vwa_tasks_without_reset.py           |  45 ++-
 5 files changed, 121 insertions(+), 311 deletions(-)

diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py
index c9dfe1cb..b6ae9eb3 100644
--- a/browsergym/core/src/browsergym/core/action/functions.py
+++ b/browsergym/core/src/browsergym/core/action/functions.py
@@ -1,8 +1,9 @@
 # these are placeholders
 # all these symbols will be available in browsergym actions
-import playwright.sync_api
 from typing import Literal
 
+import playwright.sync_api
+
 from .utils import (
     add_demo_mode_effects,
     get_elem_by_bid,
@@ -527,7 +528,15 @@ def new_tab():
     # set the new page as the active page
     page = page.context.new_page()
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/api/class-page#page-close
@@ -548,7 +557,15 @@ def tab_close():
     else:
         page = context.new_page()
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/api/class-page#page-bring-to-front
@@ -562,7 +579,15 @@ def tab_focus(index: int):
     global page  # set the focused page as the active page
     page = page.context.pages[index]
     # trigger the callback that sets this page as active in browsergym
-    page.locate("html").dispatch_event("pageshow")
+    page.evaluate(
+        """\
+const event = new Event('pageshow', {
+    bubbles: true,  // Whether the event bubbles up through the DOM or not
+    cancelable: false  // Whether the event can be canceled
+});
+window.dispatchEvent(event);
+"""
+    )
 
 
 # https://playwright.dev/python/docs/input#upload-files
diff --git a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
index 3e5290bb..6032410b 100644
--- a/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
+++ b/browsergym/visualwebarena/src/browsergym/visualwebarena/task.py
@@ -227,10 +227,11 @@ def teardown(self) -> None:
     def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
-        # check that all open tabs are either blank or within the list of WebArena URLs
-        authorized_locations = [
+
+        # safeguard: check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = ["newtab", ""] + [
             urllib.parse.urlparse(url).netloc
-            for url in [*self.webarena_instance.urls, self.webarena_instance.home_url]
+            for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
         ]
         for open_page in page.context.pages:
             page_location = urllib.parse.urlparse(open_page.url).netloc
diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py
index a7e3fe98..90b63a83 100644
--- a/browsergym/webarena/src/browsergym/webarena/task.py
+++ b/browsergym/webarena/src/browsergym/webarena/task.py
@@ -156,10 +156,10 @@ def validate(
         self, page: playwright.sync_api.Page, chat_messages: list[str]
     ) -> Tuple[float, bool, str, dict]:
 
-        # check that all open tabs are either blank or within the list of WebArena URLs
-        authorized_locations = [
+        # safeguard: check that all open tabs are either blank or within the list of WebArena URLs
+        authorized_locations = ["newtab", ""] + [
             urllib.parse.urlparse(url).netloc
-            for url in [*self.webarena_instance.urls, self.webarena_instance.home_url]
+            for url in [*self.webarena_instance.urls.values(), self.webarena_instance.home_url]
         ]
         for open_page in page.context.pages:
             page_location = urllib.parse.urlparse(open_page.url).netloc
diff --git a/tests/core/test_actions_highlevel.py b/tests/core/test_actions_highlevel.py
index 93cbf5ee..9540c9c2 100644
--- a/tests/core/test_actions_highlevel.py
+++ b/tests/core/test_actions_highlevel.py
@@ -1,22 +1,20 @@
 import ast
-import bs4
-import gymnasium as gym
 import os
 import pathlib
 import platform
-import pytest
 import re
 
+import bs4
+import gymnasium as gym
+import pytest
 from pyparsing.exceptions import ParseException
 
 # register openended gym environments
 import browsergym.core
-
-from browsergym.utils.obs import flatten_dom_to_str
 from browsergym.core.action.highlevel import HighLevelActionSet
-from browsergym.core.action.parsers import highlevel_action_parser, NamedArgument
+from browsergym.core.action.parsers import NamedArgument, highlevel_action_parser
 from browsergym.core.constants import BROWSERGYM_ID_ATTRIBUTE as BID_ATTR
-
+from browsergym.utils.obs import flatten_dom_to_str
 
 _IS_MAC_OS = platform.system() == "Darwin"
 
@@ -1035,300 +1033,53 @@ def get_top_bottom_elems(obs):
     env.close()
 
 
-# def test_meta_action():
-#     env = BrowserEnv(
-#         task_entrypoint=OpenEndedTask,
-#         task_kwargs={"start_url": TEXT_INPUT_URL},
-#         headless=__HEADLESS__,
-#     )
-#     obs, info = env.reset()
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     fname = soup.find("input", attrs={"id": "fname"})
-#     lname = soup.find("input", attrs={"id": "lname"})
-
-#     # elementary action
-#     action = json.dumps({"action_type": "click", "x": 0, "y": 0})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert not obs["last_action_error"]
-
-#     # list of actions
-#     action = json.dumps(
-#         [{"action_type": "click", "x": 0, "y": 0}, {"action_type": "click", "x": 0, "y": 0}]
-#     )
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert not obs["last_action_error"]
-
-#     # invalid action type
-#     action = json.dumps({"action_type": "clickk", "x": 0, "y": 0})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-#     assert "Invalid" in obs["error_logs"]
-
-#     # missing action type
-#     action = json.dumps({"x": 0, "y": 0})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-#     assert "Missing" in obs["error_logs"]
-
-#     # not JSON
-#     action = action_mapping.to_playwright_code("NOT_JSON"
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-#     assert "JSONDecodeError" in obs["error_logs"]
-
-#     # empty action list
-#     action = json.dumps([])
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-#     assert "Empty" in obs["error_logs"]
-
-
-# def test_input_type_number():
-#     env = BrowserEnv(
-#         task_entrypoint=GuessNumberTask,
-#         headless=__HEADLESS__,
-#     )
-#     obs, info = env.reset()
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     input_elem = soup.find("input", attrs={"type": "number"})
-#     input_bid = input_elem.get(BID_ATTR)
-#     input_value = input_elem.get("value")
-
-#     # type using bid
-#     action = json.dumps(
-#         {
-#             "action_type": "type",
-#             BID_ATTR: input_bid,
-#             "text": "6",
-#         }
-#     )
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     input_elem = soup.find("input", attrs={"type": "number"})
-#     input_bid = input_elem.get(BID_ATTR)
-#     input_value = input_elem.get("value")
-
-#     assert input_value == "6"
-
-#     action = json.dumps(
-#         {
-#             "action_type": "type",
-#             BID_ATTR: input_bid,
-#             "text": "7",
-#         }
-#     )
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     input_elem = soup.find("input", attrs={"type": "number"})
-#     input_bid = input_elem.get(BID_ATTR)
-#     input_value = input_elem.get("value")
-
-#     assert input_value == "7"
-
-
-# def test_auto_complete():
-#     env = BrowserEnv(
-#         task_entrypoint=BookFlightTask,
-#         headless=__HEADLESS__,
-#     )
-#     obs, info = env.reset()
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-
-#     # type using bid
-#     action = json.dumps(
-#         {
-#             "action_type": "type",
-#             BID_ATTR: "20",
-#             "text": "OGG",
-#         }
-#     )
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     # find element with bid="33"
-#     element = soup.find("ul", attrs={BID_ATTR: "33"})
-#     # extre the list li as python list
-#     # list_li = element.find_all("li")
-#     # assert len(list_li) > 0
-#     # assert list_li[0].text == "Kahului, HI - Island of Maui, (OGG)"
-
-
-# def test_clear_success():
-#     env = BrowserEnv(
-#         task_entrypoint=OpenEndedTask,
-#         task_kwargs={"start_url": TEXT_INPUT_URL},
-#         headless=__HEADLESS__,
-#     )
-#     obs, info = env.reset()
-
-#     fname_element = env.driver.find_element(By.CSS_SELECTOR, value="input[id='fname']")
-#     fname_element.send_keys("Christian")
-
-#     # clear using bid
-#     action = json.dumps(
-#         {
-#             "action_type": "clear",
-#             BID_ATTR: fname_element.get_attribute(BID_ATTR),
-#         }
-#     )
-
-#     assert fname_element.get_attribute("value") == "Christian"
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     assert not obs["last_action_error"]
-#     assert fname_element.get_attribute("value") == ""
-
-#     fname_element.send_keys("Christian")
-#     # clear using bid
-#     action = json.dumps({"action_type": "clear", BID_ATTR: fname_element.get_attribute(BID_ATTR)})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     assert not obs["last_action_error"]
-#     assert fname_element.get_attribute("value") == ""
-
-#     fname_element.send_keys("Christian")
-#     # clear using css selector
-#     action = json.dumps({"action_type": "clear", "css_selector": "input[id='fname']"})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     assert not obs["last_action_error"]
-#     assert fname_element.get_attribute("value") == ""
-
-#     fname_element.send_keys("Christian")
-
-#     x, y = re.search(
-#         r"\[" + fname_element.get_attribute(BID_ATTR) + r"\] \(([-+]?[0-9\.]+), ([-+]?[0-9\.]+)\)",
-#         obs["accessibility_tree"],
-#     ).groups()
-
-#     # type at x, y coordinates
-#     action = json.dumps({"action_type": "clear", "x": x, "y": y})
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     assert not obs["last_action_error"]
-#     assert fname_element.get_attribute("value") == ""
-
-#     fname_element.send_keys("Christian")
-
-#     # clear in currently focused element
-#     action = json.dumps({"action_type": "clear"})
-#     obs, reward, terminated, truncated, info = env.step(action)
-#     assert not obs["last_action_error"]
-#     assert fname_element.get_attribute("value") == ""
-
-#     # de-focus (click 0, 0), then type text
-#     action = json.dumps({"action_type": "clear", "x": 0, "y": 0})
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert not obs["last_action_error"]
-
-
-# def test_clear_error():
-#     """In this test, we try to build a ClearAction but we use invalid args, and we check that the action fails when executed in the environment"""
-#     env = BrowserEnv(
-#         task_entrypoint=OpenEndedTask,
-#         task_kwargs={"start_url": TEXT_INPUT_URL},
-#         headless=__HEADLESS__,
-#     )
-#     obs, info = env.reset()
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     fname = soup.find("input", attrs={"id": "fname"})
-#     lname = soup.find("input", attrs={"id": "lname"})
-
-#     ################ 1. invalid args : both css_selector and bid mentionned ################
-#     action = json.dumps(
-#         {
-#             "action_type": "clear",
-#             BID_ATTR: fname.get(BID_ATTR),
-#             "css_selector": "input[id='fname']",
-#         }
-#     )
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     fname = soup.find("input", attrs={"id": "fname"})
-#     lname = soup.find("input", attrs={"id": "lname"})
-
-#     x, y = re.search(
-#         r"\[" + lname.get(BID_ATTR) + r"\] \(([-+]?[0-9\.]+), ([-+]?[0-9\.]+)\)",
-#         obs["accessibility_tree"],
-#     ).groups()
-
-#     ################ 2. invalid args : both bid and position mentionned ################
-
-#     action = json.dumps(
-#         {
-#             "action_type": "clear",
-#             BID_ATTR: lname.get(BID_ATTR),
-#             "x": x,
-#             "y": y,
-#         }
-#     )
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
-
-#     soup = bs4.BeautifulSoup(obs["html"], "lxml")
-#     fname = soup.find("input", attrs={"id": "fname"})
-#     lname = soup.find("input", attrs={"id": "lname"})
-
-#     x, y = re.search(
-#         r"\[" + lname.get(BID_ATTR) + r"\] \(([-+]?[0-9\.]+), ([-+]?[0-9\.]+)\)",
-#         obs["accessibility_tree"],
-#     ).groups()
-
-#     ################ 3. invalid args : both css_selector and position mentionned ################
-
-#     action = json.dumps(
-#         {
-#             "action_type": "clear",
-#             "css_selector": "input[id='lname']",
-#             "x": x,
-#             "y": y,
-#         }
-#     )
-
-#     obs, reward, terminated, truncated, info = env.step(action)
-
-#     assert obs["last_action_error"]
+def test_tab_actions():
+    action_set = HighLevelActionSet(subsets=["tab", "nav"])  # tab + navigation actions only
 
+    env = gym.make(
+        "browsergym/openended",
+        task_kwargs={"start_url": CHECKBOX_URL},
+        headless=__HEADLESS,
+        slow_mo=__SLOW_MO,
+        timeout=__TIMEOUT,
+        action_mapping=action_set.to_python_code,
+    )
+    obs, info = env.reset()  # expect a single open tab
+    assert not obs["last_action_error"]
+    assert len(obs["open_pages_urls"]) == 1
+    assert len(obs["open_pages_titles"]) == 1
+    assert obs["active_page_index"] == 0
+    assert obs["open_pages_urls"][obs["active_page_index"][0]] == obs["url"]  # "url" = active tab
 
-@pytest.mark.skip(reason="Not implemented yet")
-def test_tab_focus():
-    # TODO
-    pass
+    obs, reward, terminated, truncated, info = env.step("new_tab()")  # new tab becomes active
+    assert not obs["last_action_error"]
+    assert len(obs["open_pages_urls"]) == 2
+    assert len(obs["open_pages_titles"]) == 2
+    assert obs["active_page_index"] == 1
+    assert obs["open_pages_urls"][obs["active_page_index"][0]] == obs["url"]
+
+    obs, reward, terminated, truncated, info = env.step(f"goto({repr(TEXTBOX_URL)})")  # same tab
+    assert not obs["last_action_error"]
+    assert len(obs["open_pages_urls"]) == 2
+    assert len(obs["open_pages_titles"]) == 2
+    assert obs["active_page_index"] == 1
+    assert obs["open_pages_urls"][obs["active_page_index"][0]] == obs["url"]
 
-@pytest.mark.skip(reason="Not implemented yet")
-def test_new_tab():
-    # TODO
-    pass
+    obs, reward, terminated, truncated, info = env.step("tab_focus(0)")  # back to first tab
+    assert not obs["last_action_error"]
+    assert len(obs["open_pages_urls"]) == 2
+    assert len(obs["open_pages_titles"]) == 2
+    assert obs["active_page_index"] == 0
+    assert obs["open_pages_urls"][obs["active_page_index"][0]] == obs["url"]
+
+    obs, reward, terminated, truncated, info = env.step("tab_close()")  # one tab must remain
+    assert not obs["last_action_error"]
+    assert len(obs["open_pages_urls"]) == 1
+    assert len(obs["open_pages_titles"]) == 1
+    assert obs["active_page_index"] == 0
+    assert obs["open_pages_urls"][obs["active_page_index"][0]] == obs["url"]
 
-@pytest.mark.skip(reason="Not implemented yet")
-def test_tab_close():
-    # TODO
-    pass
+    env.close()
 
 
 def test_mouse_down_up():
diff --git a/tests/visualwebarena/test_vwa_tasks_without_reset.py b/tests/visualwebarena/test_vwa_tasks_without_reset.py
index 71896c9d..05d55856 100644
--- a/tests/visualwebarena/test_vwa_tasks_without_reset.py
+++ b/tests/visualwebarena/test_vwa_tasks_without_reset.py
@@ -1,23 +1,21 @@
-import gymnasium as gym
 import logging
 import os
-import playwright.sync_api
-import pytest
 import random
 
-from tenacity import retry, stop_after_attempt, retry_if_exception_type, wait_fixed
+import gymnasium as gym
+import playwright.sync_api
+import pytest
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
 
 # register gym environments
 import browsergym.visualwebarena
 
-
 __SLOW_MO = 1000 if "DISPLAY_BROWSER" in os.environ else None
 __HEADLESS = False if "DISPLAY_BROWSER" in os.environ else True
 
 
 from browsergym.visualwebarena import VISUALWEBARENA_TASK_IDS_WITHOUT_RESET
 
-
 rng = random.Random(1)
 task_ids = rng.sample(VISUALWEBARENA_TASK_IDS_WITHOUT_RESET, 25)
 print(task_ids)
@@ -40,3 +38,48 @@ def test_env_generic(task_id):
     )
     obs, info = env.reset()
     env.close()
+
+
+@retry(
+    stop=stop_after_attempt(5),
+    retry=retry_if_exception_type(playwright.sync_api.TimeoutError),
+    wait=wait_fixed(2),
+    reraise=True,
+    before_sleep=lambda _: logging.info("Retrying due to a TimeoutError..."),
+)
+def test_domain_safeguard():
+    """Check that the task terminates once a tab leaves the authorized domains.
+
+    Opening, closing and focusing tabs must not end the episode; navigating to
+    an external site (google.com) must set ``terminated``.
+    """
+    env = gym.make(
+        "browsergym/visualwebarena.398",
+        headless=__HEADLESS,
+        slow_mo=__SLOW_MO,
+    )
+    # close the environment on every exit path: a failed assertion or a
+    # TimeoutError (which triggers a retry) must not leak the browser
+    try:
+        obs, info = env.reset()
+        assert not obs["last_action_error"]
+
+        # tab management alone must never trip the safeguard
+        obs, reward, terminated, truncated, info = env.step("new_tab()")
+        assert not obs["last_action_error"]
+        assert not (terminated or truncated)
+
+        obs, reward, terminated, truncated, info = env.step("tab_close()")
+        assert not obs["last_action_error"]
+        assert not (terminated or truncated)
+
+        obs, reward, terminated, truncated, info = env.step("tab_focus(0)")
+        assert not obs["last_action_error"]
+        assert not (terminated or truncated)
+
+        # leaving the task's domains must terminate the episode
+        obs, reward, terminated, truncated, info = env.step('goto("http://www.google.com")')
+        assert not obs["last_action_error"]
+        assert terminated
+    finally:
+        env.close()

From 2b287b7abc956cfd09bbd248ac15136349c093c5 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:28:33 -0400
Subject: [PATCH 05/10] gitignore results/ and add sandbox scratch scripts

---
 .gitignore  |  2 ++
 sandbox.py  | 23 +++++++++++++++++++++++
 sandbox2.py | 22 ++++++++++++++++++++++
 sandbox3.py |  0
 4 files changed, 47 insertions(+)
 create mode 100644 sandbox.py
 create mode 100644 sandbox2.py
 create mode 100644 sandbox3.py

diff --git a/.gitignore b/.gitignore
index 4e091724..d6308cca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,5 @@ error_logs.txt
 tests/results
 tmp.py
 .vscode/settings.json
+
+results/
diff --git a/sandbox.py b/sandbox.py
new file mode 100644
index 00000000..acfa4bba
--- /dev/null
+++ b/sandbox.py
@@ -0,0 +1,31 @@
+"""Scratch demo: round-trip a dataclass through JSON with ``dataclasses_json``."""
+
+# NOTE(review): looks like a local scratch script rather than package code --
+# confirm that it is meant to be committed.
+
+from dataclasses import dataclass
+
+from dataclasses_json import DataClassJsonMixin
+
+
+@dataclass
+class Test(DataClassJsonMixin):
+    """Two-field record used to exercise ``to_json`` / ``from_json``."""
+
+    a: int
+    b: str
+
+    def do_something(self):
+        """Print both fields on one line, separated by a space."""
+        fields = (self.a, self.b)
+        print(*fields)
+
+
+# serialize an instance, show the JSON text, then rebuild an instance from it
+x: Test = Test(a=0, b="hello")
+x_json = x.to_json()
+print(x_json)
+
+# calling a method on the rebuilt object exercises the round trip
+y = Test.from_json(x_json)
+y.do_something()
diff --git a/sandbox2.py b/sandbox2.py
new file mode 100644
index 00000000..5ead25f3
--- /dev/null
+++ b/sandbox2.py
@@ -0,0 +1,27 @@
+"""Scratch script: print one "task,level,category" line per WorkArena task."""
+
+# NOTE(review): looks like a local scratch script rather than package code --
+# confirm that it is meant to be committed.
+
+from browsergym.workarena import (
+    AGENT_CURRICULUM_L2,
+    AGENT_CURRICULUM_L3,
+    TASK_CATEGORY_MAP,
+)
+
+
+def _curriculum_rows(curriculum, level):
+    """Yield (task_id, level, category) for every task in every bucket of *curriculum*."""
+    for category, items in curriculum.items():
+        for task_set in items["buckets"]:
+            for task in task_set:
+                yield task.get_task_id(), level, category
+
+
+# L1 rows come straight from the task-name -> category map
+metadata = [(task_name, "l1", category) for task_name, category in TASK_CATEGORY_MAP.items()]
+# L2 / L3 rows are gathered from the buckets of each curriculum category
+metadata.extend(_curriculum_rows(AGENT_CURRICULUM_L2, "l2"))
+metadata.extend(_curriculum_rows(AGENT_CURRICULUM_L3, "l3"))
+
+print("\n".join(",".join(row) for row in metadata))
diff --git a/sandbox3.py b/sandbox3.py
new file mode 100644
index 00000000..e69de29b

From 78c09021c82c2448a302618e3c07dfaaa182b2aa Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:29:30 -0400
Subject: [PATCH 06/10] README update

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index d711b045..b0121d5e 100644
--- a/README.md
+++ b/README.md
@@ -161,8 +161,7 @@ print("\n".join(env_ids))
 If you want to experiment with a demo agent in BrowserGym, follow these steps:
 
 ```sh
-cd demo-agent
-conda env create -f environment.yml
+conda env create -f demo-agent/environment.yml
 conda activate demo-agent
 # or simply use `pip install -r requirements.txt`
 playwright install chromium
@@ -172,27 +171,27 @@ Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.
 
 Launch the demo agent on the open web:
 ```sh
-python run_demo.py --task_name openended --start_url https://www.google.com
+python demo-agent/run_demo.py --task_name openended --start_url https://www.google.com
 ```
 
 Or use it to solve a simple MiniWoB task:
 ```sh
-python run_demo.py --task_name miniwob.click-test
+python demo-agent/run_demo.py --task_name miniwob.click-test
 ```
 
 A VisualWebArena task:
 ```sh
-python run_demo.py --task_name visualwebarena.398
+python demo-agent/run_demo.py --task_name visualwebarena.398
 ```
 
 A WebArena task:
 ```sh
-python run_demo.py --task_name webarena.4
+python demo-agent/run_demo.py --task_name webarena.4
 ```
 
 A WorkArena task:
 ```sh
-python run_demo.py --task_name workarena.servicenow.order-standard-laptop
+python demo-agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
 ```
 
 You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more! (see `python run_demo.py --help`)

From 55a9a05947f3878ca2ae07e2e33c4a72d7884a4c Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:34:18 -0400
Subject: [PATCH 07/10] README + demo_agent cleanup

---
 README.md                               | 14 +++++++-------
 demo_agent/{basic_agent.py => agent.py} |  5 ++---
 demo_agent/requirements.txt             | 16 ++--------------
 demo_agent/run_demo.py                  |  6 +++---
 4 files changed, 14 insertions(+), 27 deletions(-)
 rename demo_agent/{basic_agent.py => agent.py} (99%)

diff --git a/README.md b/README.md
index b0121d5e..fefb1473 100644
--- a/README.md
+++ b/README.md
@@ -161,8 +161,8 @@ print("\n".join(env_ids))
 If you want to experiment with a demo agent in BrowserGym, follow these steps:
 
 ```sh
-conda env create -f demo-agent/environment.yml
-conda activate demo-agent
+conda env create -f demo_agent/environment.yml
+conda activate demo_agent
 # or simply use `pip install -r requirements.txt`
 playwright install chromium
 ```
@@ -171,27 +171,27 @@ Our demo agent uses `openai` as a backend, be sure to set your `OPENAI_API_KEY`.
 
 Launch the demo agent on the open web:
 ```sh
-python demo-agent/run_demo.py --task_name openended --start_url https://www.google.com
+python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
 ```
 
 Or use it to solve a simple MiniWoB task:
 ```sh
-python demo-agent/run_demo.py --task_name miniwob.click-test
+python demo_agent/run_demo.py --task_name miniwob.click-test
 ```
 
 A VisualWebArena task:
 ```sh
-python demo-agent/run_demo.py --task_name visualwebarena.398
+python demo_agent/run_demo.py --task_name visualwebarena.398
 ```
 
 A WebArena task:
 ```sh
-python demo-agent/run_demo.py --task_name webarena.4
+python demo_agent/run_demo.py --task_name webarena.4
 ```
 
 A WorkArena task:
 ```sh
-python demo-agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
+python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
 ```
 
 You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more! (see `python run_demo.py --help`)
diff --git a/demo_agent/basic_agent.py b/demo_agent/agent.py
similarity index 99%
rename from demo_agent/basic_agent.py
rename to demo_agent/agent.py
index e6515c4d..632c0bbc 100644
--- a/demo_agent/basic_agent.py
+++ b/demo_agent/agent.py
@@ -4,6 +4,7 @@
 import logging
 
 import numpy as np
+import openai
 from PIL import Image
 
 from browsergym.core.action.highlevel import HighLevelActionSet
@@ -66,9 +67,7 @@ def __init__(
         if not (use_html or use_axtree):
             raise ValueError(f"Either use_html or use_axtree must be set to True.")
 
-        from openai import OpenAI
-
-        self.openai_client = OpenAI()
+        self.openai_client = openai.OpenAI()
 
         self.action_set = HighLevelActionSet(
             subsets=["chat", "tab", "nav", "bid", "infeas"],  # define a subset of the action space
diff --git a/demo_agent/requirements.txt b/demo_agent/requirements.txt
index b4614cce..a0fd3900 100644
--- a/demo_agent/requirements.txt
+++ b/demo_agent/requirements.txt
@@ -1,14 +1,2 @@
-browsergym-core>=0.3
-browsergym-experiments>=0.3
-openai>=1.35.4,<1.36
-langchain>=0.2,<0.3
-langchain_openai>=0.1.10,<0.2
-tiktoken
-huggingface_hub
-contexttimer
-ipython
-pyyaml>=6
-pandas
-joblib
-transformers
-langchain_community>=0.2.6,<0.3
+browsergym
+openai
diff --git a/demo_agent/run_demo.py b/demo_agent/run_demo.py
index a8702cd9..82cc2c96 100644
--- a/demo_agent/run_demo.py
+++ b/demo_agent/run_demo.py
@@ -1,11 +1,11 @@
 import argparse
 
+# locally defined agent
+from agent import DemoAgentArgs
+
 # browsergym experiments utils
 from browsergym.experiments import EnvArgs, ExpArgs, get_exp_result
 
-# locally defined agent
-from basic_agent import DemoAgentArgs
-
 
 def str2bool(v):
     if isinstance(v, bool):

From 9c5f700d938d6cc2f9655057c61312c89db1319f Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:38:13 -0400
Subject: [PATCH 08/10] rust dependency for tiktoken

---
 demo_agent/environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/demo_agent/environment.yml b/demo_agent/environment.yml
index a1a4f216..1104051c 100644
--- a/demo_agent/environment.yml
+++ b/demo_agent/environment.yml
@@ -8,6 +8,7 @@ channels:
 dependencies:
   - python>=3.10
   - pip
+  - rust
 
   - pip:
       - -r requirements.txt

From 4bef8be71bfce8f53e79e7b937745807c7c302b4 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:46:45 -0400
Subject: [PATCH 09/10] remove rust dependency

---
 demo_agent/environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/demo_agent/environment.yml b/demo_agent/environment.yml
index 1104051c..a1a4f216 100644
--- a/demo_agent/environment.yml
+++ b/demo_agent/environment.yml
@@ -8,7 +8,6 @@ channels:
 dependencies:
   - python>=3.10
   - pip
-  - rust
 
   - pip:
       - -r requirements.txt

From afeac27aad89e3c83d1a58dc19eee10317770bf3 Mon Sep 17 00:00:00 2001
From: Maxime Gasse <maxime.gasse@gmail.com>
Date: Thu, 17 Oct 2024 17:47:05 -0400
Subject: [PATCH 10/10] tab_focus page.bring_to_front()

---
 browsergym/core/src/browsergym/core/action/functions.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py
index b6ae9eb3..222629b2 100644
--- a/browsergym/core/src/browsergym/core/action/functions.py
+++ b/browsergym/core/src/browsergym/core/action/functions.py
@@ -578,6 +578,7 @@ def tab_focus(index: int):
     """
     global page  # set the focused page as the active page
     page = page.context.pages[index]
+    page.bring_to_front()
     # trigger the callback that sets this page as active in browsergym
     page.evaluate(
         """\