Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔥 BS4 Bindings #36

Merged
merged 6 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions harambe/contrib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .types import WebHarness
from .soup import soup_harness
from .playwright import playwright_harness

__all__ = ["WebHarness", "soup_harness", "playwright_harness"]
3 changes: 3 additions & 0 deletions harambe/contrib/playwright/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .harness import playwright_harness

__all__ = ["playwright_harness"]
34 changes: 4 additions & 30 deletions harambe/harness.py → harambe/contrib/playwright/harness.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,20 @@
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from urllib.parse import urlparse

from playwright.async_api import async_playwright, Page, ProxySettings
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

from harambe.contrib.playwright.impl import PlaywrightPage
from harambe.handlers import UnnecessaryResourceHandler


def proxy_from_url(url: str) -> ProxySettings:
parsed = urlparse(url, allow_fragments=False)

if not parsed.hostname:
parsed = urlparse(f"http://{url}", allow_fragments=False)

if not all(
[
parsed.hostname,
parsed.username,
parsed.password,
]
):
raise ValueError(f"Invalid proxy URL: {url}")

proxy: ProxySettings = {
"server": parsed.hostname,
"username": parsed.username,
"password": parsed.password,
}

if parsed.port:
proxy["server"] += f":{parsed.port}"

return proxy
from harambe.proxy import proxy_from_url


@asynccontextmanager
async def playwright_harness(
headless: bool,
cdp_endpoint: str | None,
proxy: str | None = None,
) -> AsyncGenerator[Page, None]:
) -> AsyncGenerator[PlaywrightPage, None]:
"""
Context manager for Playwright. Starts a new browser, context, and page, and closes them when done.
Also does some basic setup like setting the viewport, user agent, ignoring HTTPS errors, creation of HAR file, and stealth.
Expand Down
11 changes: 11 additions & 0 deletions harambe/contrib/playwright/impl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from playwright.async_api import ElementHandle, Page

from harambe.contrib.types import AbstractElementHandle, AbstractPage


class PlaywrightElementHandle(ElementHandle, AbstractElementHandle):
    """Playwright's native ElementHandle, registered as an AbstractElementHandle.

    No overrides needed: Playwright's API already provides ``inner_text``,
    ``get_attribute``, and ``click`` with matching signatures.
    """

    pass


class PlaywrightPage(Page, AbstractPage[PlaywrightElementHandle]):
    """Playwright's native Page, registered as an AbstractPage.

    No overrides needed: Playwright's API already provides ``goto``, ``url``,
    ``content``, and the selector-query methods with matching signatures.
    """

    pass
3 changes: 3 additions & 0 deletions harambe/contrib/soup/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .harness import soup_harness

__all__ = ["soup_harness"]
19 changes: 19 additions & 0 deletions harambe/contrib/soup/harness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from curl_cffi.requests import AsyncSession

from harambe.contrib.soup.impl import SoupPage


@asynccontextmanager
async def soup_harness(
    headless: bool,
    cdp_endpoint: str | None,
    proxy: str | None = None,
) -> AsyncGenerator[SoupPage, None]:
    """Yield a :class:`SoupPage` backed by a curl_cffi ``AsyncSession``.

    Mirrors the ``playwright_harness`` signature so either can be passed
    where a ``WebHarness`` is expected.

    :param headless: ignored — there is no browser to show or hide
    :param cdp_endpoint: must be falsy; remote browsers are unsupported here
    :param proxy: optional proxy URL forwarded to the HTTP session
    :raises ValueError: if a CDP endpoint is supplied
    """
    if cdp_endpoint:
        raise ValueError("CDP endpoint is not supported for Soup")

    # The session owns all connection state; closing it on exit releases
    # every resource the page used.
    async with AsyncSession(proxy=proxy) as session:
        yield SoupPage(session)
65 changes: 65 additions & 0 deletions harambe/contrib/soup/impl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import Any

from bs4 import Tag, BeautifulSoup
from curl_cffi.requests import AsyncSession

from harambe.contrib.types import AbstractElementHandle, Selectable, AbstractPage


class SoupElementHandle(AbstractElementHandle, Selectable["SoupElementHandle"]):
    """Element handle backed by a BeautifulSoup ``Tag``.

    Provides the read-only parts of the handle contract; interaction
    methods such as :meth:`click` are unsupported for static HTML.
    """

    def __init__(self, tag: Tag) -> None:
        self._tag = tag

    @classmethod
    def from_tags(cls, tag: list[Tag]) -> list["SoupElementHandle"]:
        """Wrap each soup tag in the list into an element handle."""
        return list(map(cls, tag))

    async def inner_text(self) -> str:
        """Return all text contained within the tag."""
        return self._tag.get_text()

    async def get_attribute(self, name: str) -> str:
        # NOTE(review): bs4 ``Tag.get`` can return ``None`` (missing) or a
        # list (multi-valued attrs like ``class``) — confirm callers expect str.
        return self._tag.get(name)

    async def query_selector_all(self, selector: str) -> list["SoupElementHandle"]:
        """Return handles for every descendant matching the CSS selector."""
        return self.from_tags(self._tag.select(selector))

    async def query_selector(self, selector: str) -> "SoupElementHandle":
        # NOTE(review): ``select_one`` yields ``None`` on no match, so this
        # handle would wrap ``None`` and fail on first use — verify callers.
        return SoupElementHandle(self._tag.select_one(selector))

    async def click(self) -> None:
        # Parsed static HTML cannot be interacted with.
        raise NotImplementedError()

    async def wait_for_selector(self, selector: str, **kwargs: Any) -> None:
        # No-op: a parsed document is already fully "loaded".
        pass


class SoupPage(AbstractPage[SoupElementHandle]):
    """Page implementation that fetches HTML with curl_cffi and parses it
    with BeautifulSoup. Pure HTTP — no JavaScript execution.
    """

    # Both populated by the first call to ``goto``.
    _soup: BeautifulSoup
    _url: str

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    @property
    def url(self) -> str:
        """URL of the most recently fetched page, as reported by the response."""
        return self._url

    async def goto(self, url: str) -> None:
        """Fetch ``url`` and parse the response body into a soup tree."""
        response = await self._session.get(url)
        self._url = response.url
        self._soup = BeautifulSoup(response.text, "html.parser")

    async def query_selector_all(self, selector: str) -> list[SoupElementHandle]:
        """Return handles for every element matching the CSS selector."""
        return SoupElementHandle.from_tags(self._soup.select(selector))

    async def query_selector(self, selector: str) -> SoupElementHandle:
        # NOTE(review): ``select_one`` yields ``None`` on no match, so the
        # returned handle may wrap ``None`` — verify callers handle a miss.
        return SoupElementHandle(self._soup.select_one(selector))

    async def wait_for_timeout(self, timeout: int) -> None:
        # No-op: there is no render loop to wait on.
        pass

    async def content(self) -> str:
        """Serialize the parsed document back to HTML."""
        return str(self._soup)

    async def wait_for_selector(self, selector: str, **kwargs: Any) -> None:
        # No-op: the whole document is available as soon as ``goto`` returns.
        pass
57 changes: 57 additions & 0 deletions harambe/contrib/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import abc

# noinspection PyUnresolvedReferences,PyProtectedMember
from contextlib import _AsyncGeneratorContextManager
from typing import TypeVar, Generic, Callable, Any

# Element-handle type parameter shared by Selectable and AbstractPage below.
T = TypeVar("T", bound="AbstractElementHandle")
# Harness factory signature: (headless, cdp_endpoint, proxy) -> async context
# manager yielding a page. Both soup and playwright harnesses conform to it.
WebHarness = Callable[
    [bool, str | None, str | None], _AsyncGeneratorContextManager["AbstractPage[T]"]
]


class AbstractElementHandle(abc.ABC):
    """Backend-agnostic handle to a single element on a page."""

    @abc.abstractmethod
    async def inner_text(self) -> str:
        """Return the element's text content."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def get_attribute(self, name: str) -> str:
        """Return the value of the attribute ``name``."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def click(self) -> None:
        """Click the element (non-browser backends may not support this)."""
        raise NotImplementedError()


class Selectable(Generic[T], abc.ABC):
    """Interface for anything queryable with CSS selectors (pages, elements)."""

    @abc.abstractmethod
    async def query_selector_all(self, selector: str) -> list[T]:
        """Return handles for all elements matching ``selector``."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def query_selector(self, selector: str) -> T:
        """Return a handle for the first element matching ``selector``."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def wait_for_selector(self, selector: str, **kwargs: Any) -> None:
        """Wait until ``selector`` matches (static backends may no-op)."""
        raise NotImplementedError()


class AbstractPage(Selectable[T], abc.ABC):
    """Backend-agnostic page: navigation plus the Selectable query interface."""

    @property
    @abc.abstractmethod
    def url(self) -> str:
        """Current page URL."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def goto(self, url: str) -> None:
        """Navigate to ``url``."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def wait_for_timeout(self, timeout: int) -> None:
        """Pause for ``timeout`` milliseconds (static backends may no-op)."""
        raise NotImplementedError()

    @abc.abstractmethod
    async def content(self) -> str:
        """Return the full HTML content of the page."""
        raise NotImplementedError()
15 changes: 10 additions & 5 deletions harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@

import aiohttp
from playwright.async_api import (
Page,
ElementHandle,
TimeoutError as PlaywrightTimeoutError,
Page,
)

from harambe.handlers import (
ResourceRequestHandler,
ResourceType,
)
from harambe.harness import playwright_harness

from harambe.contrib import playwright_harness, WebHarness
from harambe.normalize_url import normalize_url
from harambe.observer import (
LocalStorageObserver,
Expand All @@ -27,6 +28,7 @@
DuplicateHandler,
ObservationTrigger,
)

from harambe.parser.parser import PydanticSchemaParser
from harambe.tracker import FileDataTracker
from harambe.types import (
Expand Down Expand Up @@ -160,7 +162,8 @@ async def paginate(
if next_url.startswith("?"):
# TODO: merge query params
next_url = self.page.url.split("?")[0] + next_url
await self.page.goto(next_url)

await self.page.goto(normalize_url(next_url, self.page.url))
await self.page.wait_for_timeout(timeout)

if next_url:
Expand All @@ -175,7 +178,7 @@ async def paginate(
raise TimeoutError(
f"{e.args[0]} You may increase the timeout by passing `timeout` in ms to `SDK.paginate`. Alternatively, this may mean that the next page element or URL was not found and pagination is complete."
) from e
except (TimeoutError, StopAsyncIteration):
except (TimeoutError, AttributeError, StopAsyncIteration):
return

async def capture_url(
Expand Down Expand Up @@ -280,6 +283,7 @@ async def run(
cdp_endpoint: Optional[str] = None,
proxy: Optional[str] = None,
setup: Optional[SetupType] = None,
harness: WebHarness = playwright_harness,
) -> "SDK":
"""
Convenience method for running a scraper. This will launch a browser and
Expand All @@ -292,6 +296,7 @@ async def run(
:param cdp_endpoint: endpoint to connect to the browser (if using a remote browser)
:param proxy: proxy to use for the browser
:param setup: setup function to run before the scraper
:param harness: the harness to use for the browser
:return none: everything should be saved to the database or file
"""
domain = getattr(scraper, "domain", None)
Expand All @@ -302,7 +307,7 @@ async def run(
if isinstance(url, Path):
url = f"file://{url.resolve()}"

async with playwright_harness(
async with harness(
headless=headless,
cdp_endpoint=cdp_endpoint,
proxy=proxy,
Expand Down
4 changes: 2 additions & 2 deletions harambe/handlers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
from abc import ABC
from typing import Literal
from typing import Literal, Any

from playwright.async_api import Route, Page

Expand Down Expand Up @@ -45,7 +45,7 @@ async def __aenter__(self):
await self.page.context.route(self.url_pattern, self.handle)
return self

async def __aexit__(self, exc_type, exc_val, exc_tb):
async def __aexit__(self, *_: Any, **__: Any):
await self.page.context.unroute(self.url_pattern, self.handle)
await self.page.bring_to_front()
for page in self.page.context.pages:
Expand Down
2 changes: 1 addition & 1 deletion harambe/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _schema_to_pydantic_model(

return create_model(model_name, __config__=config, **fields)

def _get_type(self, field: str) -> Type:
def _get_type(self, field: str) -> Type[Any]:
field_type = self.field_types.get(field)
if not field_type:
raise ValueError(f"Unsupported field type: {field}")
Expand Down
3 changes: 2 additions & 1 deletion harambe/parser/type_phone_number.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import Type

import phonenumbers
from pydantic.functional_validators import AfterValidator
Expand All @@ -14,7 +15,7 @@


class ParserTypePhoneNumber:
def __new__(cls):
def __new__(cls) -> Type[str]:
return Annotated[str, AfterValidator(cls.validate_type)]

@staticmethod
Expand Down
6 changes: 3 additions & 3 deletions harambe/parser/type_url.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Type, Callable
from urllib.parse import urljoin, urlparse

from pydantic.functional_validators import AfterValidator
Expand All @@ -17,11 +17,11 @@


class ParserTypeUrl:
def __new__(cls, base_url: Optional[URL] = None):
def __new__(cls, base_url: Optional[URL] = None) -> Type[str]:
return Annotated[str, AfterValidator(cls.validate_type(base_url))]

@staticmethod
def validate_type(base_url: Optional[URL]):
def validate_type(base_url: Optional[URL]) -> Callable[[URL], str]:
def _validate_type(url: URL) -> str:
# Transform relative URLs into absolute using base_url
if base_url is not None:
Expand Down
30 changes: 30 additions & 0 deletions harambe/proxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from urllib.parse import urlparse

from playwright.async_api import ProxySettings


def proxy_from_url(url: str) -> ProxySettings:
    """Build Playwright ``ProxySettings`` from a proxy URL string.

    Accepts URLs with or without a scheme: a bare ``user:pass@host:port``
    fails to split on the first parse, so it is retried with an ``http://``
    prefix to let urlparse separate the credentials and host.

    :param url: proxy URL containing at least hostname, username, password
    :return: mapping with ``server`` (host, plus ``:port`` when present),
        ``username``, and ``password``
    :raises ValueError: if hostname, username, or password is missing
    """
    parts = urlparse(url, allow_fragments=False)

    # Without a scheme the whole string lands in ``path`` and hostname is
    # empty; reparse with an explicit scheme.
    if not parts.hostname:
        parts = urlparse(f"http://{url}", allow_fragments=False)

    if not (parts.hostname and parts.username and parts.password):
        raise ValueError(f"Invalid proxy URL: {url}")

    server = parts.hostname
    if parts.port:
        server = f"{server}:{parts.port}"

    # NOTE(review): any scheme in the input is dropped here, so Playwright
    # will treat the server as an HTTP proxy — confirm socks5:// proxies
    # are not expected to pass through this function.
    return {
        "server": server,
        "username": parts.username,
        "password": parts.password,
    }
Loading
Loading