Skip to content

Commit

Permalink
Merge branch 'main' into capture-url-bug
Browse files Browse the repository at this point in the history
  • Loading branch information
mohamedmamdouh22 committed Jan 21, 2025
2 parents c15125f + 9c0aa59 commit 7d3693c
Show file tree
Hide file tree
Showing 9 changed files with 21 additions and 16 deletions.
4 changes: 4 additions & 0 deletions core/harambe_core/errors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
from typing import Any


async def default_error_callback(url: str, status: int, *args: Any) -> None:
    """Default error handler: unconditionally raise ``GotoError``.

    Args:
        url: The URL forwarded to ``GotoError``.
        status: The status code forwarded to ``GotoError``.
        *args: Any further positional arguments. They are accepted and
            ignored so that callers may pass extra context without
            breaking this handler.

    Raises:
        GotoError: Always, built from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class HarambeException(Exception):
"""Base exception for all custom exceptions in Harambe."""

Expand Down
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.59.3"
version = "0.59.5"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
2 changes: 1 addition & 1 deletion core/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sdk/harambe/contrib/soup/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ async def goto(self, url: str, **kwargs: Any) -> ResponseWithStatus:

class SoupResponseWithStatus:
status: int = res.status_code
headers: dict[str, str] = res.headers

return SoupResponseWithStatus()

Expand Down
1 change: 1 addition & 0 deletions sdk/harambe/contrib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class ResponseWithStatus(Protocol):
"""Protocol for goto responses across all harnesses. Use minimal attributes required for current use cases."""

status: int
headers: dict[str, str]


class AbstractPage(Selectable[T], abc.ABC):
Expand Down
15 changes: 7 additions & 8 deletions sdk/harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
LocalStorage,
)
from harambe_core import SchemaParser, Schema
from harambe_core.errors import GotoError
from harambe_core.errors import default_error_callback
from harambe_core.normalize_url import normalize_url
from harambe_core.parser.expression import ExpressionEvaluator
from playwright.async_api import (
Expand All @@ -64,10 +64,6 @@
from harambe.contrib import WebHarness, playwright_harness


async def default_callback(url: str, status: int) -> None:
    """Unconditionally raise ``GotoError`` for the given URL and status.

    Args:
        url: The URL forwarded to ``GotoError``.
        status: The status code forwarded to ``GotoError``.

    Raises:
        GotoError: Always, built from ``url`` and ``status``.
    """
    raise GotoError(url, status)


class AsyncScraper(Protocol):
"""
Protocol that all classed based scrapers should implement.
Expand Down Expand Up @@ -243,13 +239,14 @@ async def capture_download(
clickable: ElementHandle,
override_filename: str | None = None,
override_url: str | None = None,
timeout: float | None = None,
) -> DownloadMeta:
"""
Capture the download of a click event. This will click the element, download the resulting file
and apply some download handling logic from the observer to transform to a usable URL
"""

async with self.page.expect_download() as download_info:
async with self.page.expect_download(timeout=timeout) as download_info:
await clickable.click()
download = await download_info.value

Expand Down Expand Up @@ -457,7 +454,9 @@ async def run(
harness: WebHarness = playwright_harness,
evaluator: Optional[ExpressionEvaluator] = None,
observer: Optional[OutputObserver | List[OutputObserver]] = None,
callback: Callable[[str, int], Awaitable[None]] = default_callback,
goto_error_handler: Callable[
[str, int, dict[str, str]], Awaitable[None]
] = default_error_callback,
**harness_options: Unpack[HarnessOptions],
) -> "SDK":
"""
Expand Down Expand Up @@ -501,7 +500,7 @@ async def run(
if not harness_options.get("disable_go_to_url", False):
response = await page.goto(url)
if response.status >= 400:
await callback(url, response.status)
await goto_error_handler(url, response.status, response.headers)
elif isinstance(page, SoupPage):
page.url = url
await scraper(sdk, url, context)
Expand Down
4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.59.3"
version = "0.59.5"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.59.3",
"harambe_core==0.59.5",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
4 changes: 2 additions & 2 deletions sdk/test/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ async def test_403_status_on_goto_with_custom_callback(
async def scrape(sdk: SDK, current_url, context) -> None:
await sdk.save_data({"key": "this shouldn't be saved if GotoError is raised"})

async def custom_error_handler(url, status_code):
async def custom_error_handler(url, status_code, *args):
print(f"Handled {status_code} for {url} gracefully.")

error_callback = custom_error_handler
Expand All @@ -698,7 +698,7 @@ async def custom_error_handler(url, status_code):
schema={},
context={"status": "Open"},
observer=observer,
callback=error_callback,
goto_error_handler=error_callback,
)

# Ensure data is saved when error is handled (either with custom or no callback)
Expand Down
4 changes: 2 additions & 2 deletions sdk/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7d3693c

Please sign in to comment.