Skip to content

Commit

Permalink
Add dynamic waiting mechanism for new page openings in capture_url
Browse files Browse the repository at this point in the history
…method (#108)

* Implement dynamic waiting mechanism for new page openings within a timeout

* formatting

* Refactor capture_url method to include timeout parameter

* Remove 'baboon/' from .gitignore

* Refactor _wait_for_new_page method to include timeout parameter

* Refactor timeout parameter in _wait_for_new_page method to use milliseconds instead of seconds

* Refactor timeout parameter in _wait_for_new_page method to use milliseconds instead of seconds
  • Loading branch information
mohamedmamdouh22 authored Jan 22, 2025
1 parent 9c0aa59 commit c007d42
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 13 deletions.
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.59.4"
version = "0.59.5"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
2 changes: 1 addition & 1 deletion core/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,10 @@ async def paginate(
return

async def capture_url(
self, clickable: ElementHandle, resource_type: ResourceType = "document"
self,
clickable: ElementHandle,
resource_type: ResourceType = "document",
timeout: Optional[int] = 10000,
) -> URL | None:
"""
Capture the url of a click event. This will click the element and return the url
Expand All @@ -220,11 +223,12 @@ async def capture_url(
:param clickable: the element to click
:param resource_type: the type of resource to capture
:param timeout: the time to wait for the new page to open (in ms)
:return url: the url of the captured resource or None if no match was found
:raises ValueError: if more than one request matches
"""
async with ResourceRequestHandler(
self.page, resource_type=resource_type
self.page, resource_type=resource_type, timeout=timeout
) as handler:
await clickable.click()

Expand Down
26 changes: 21 additions & 5 deletions sdk/harambe/handlers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import base64
import time
from abc import ABC
from typing import Any, Literal, Self

Expand Down Expand Up @@ -37,12 +38,13 @@ def __init__(
self,
page: Page,
resource_type: ResourceType,
timeout: int,
url_pattern: str = "**/*",
):
self.page = page
self.url_pattern = url_pattern
self.resource_type = resource_type

self.timeout = timeout
self._initial_pages = [p.url for p in page.context.pages]
self._new_pages: list[str] = []

Expand All @@ -53,10 +55,24 @@ async def __aenter__(self) -> Self:
async def __aexit__(self, *_: Any, **__: Any) -> None:
await self.page.context.unroute(self.url_pattern, self.handle)
await self.page.bring_to_front()
for page in self.page.context.pages:
if page.url not in self._initial_pages:
self._new_pages.append(page.url)
await page.close()
try:
new_page = await self._wait_for_new_page()
self._new_pages.append(new_page.url)
await new_page.close()
except TimeoutError:
raise TimeoutError(
f"No new page opened within the {self.timeout} ms timeout."
)

async def _wait_for_new_page(self) -> Page:
    """
    Poll the browser context until a page that was not open initially appears.

    A page counts as new when its url is not in ``self._initial_pages``.
    The context is inspected before any waiting and once more after the
    final pause, so a page that is already open (even with a timeout of 0)
    or that opens during the last pause is still found.

    :return: the first page whose url is not among the initial page urls
    :raises TimeoutError: if no such page appears within ``self.timeout`` ms
    """
    poll_interval_ms = 100
    deadline = time.monotonic() + self.timeout / 1000
    known_urls = set(self._initial_pages)
    while True:
        for page in self.page.context.pages:
            if page.url not in known_urls:
                return page
        remaining_ms = (deadline - time.monotonic()) * 1000
        if remaining_ms <= 0:
            break
        # Cap the pause at the time left so the deadline is not overrun
        # by a whole polling interval.
        await self.page.wait_for_timeout(min(poll_interval_ms, remaining_ms))
    raise TimeoutError("Timed out waiting for a new page to open.")

async def handle(self, route: Route) -> None:
if (
Expand Down
4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.59.4"
version = "0.59.5"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.59.4",
"harambe_core==0.59.5",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
4 changes: 2 additions & 2 deletions sdk/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c007d42

Please sign in to comment.