Skip to content

Commit

Permalink
v0.1.6 Custom scripts execution
Browse files Browse the repository at this point in the history
  • Loading branch information
raznem committed Aug 31, 2024
1 parent 6a6db8a commit c3ad761
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,4 @@ cython_debug/
#.idea/

.DS_Store
raw_notebooks
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
release:
poetry publish --build

4 changes: 2 additions & 2 deletions parsera/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from parsera.main import Parsera
from parsera.main import ExtractorType, Parsera, ParseraScript

__all__ = ["Parsera"]
__all__ = ["ExtractorType", "Parsera", "ParseraScript"]
103 changes: 97 additions & 6 deletions parsera/main.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import asyncio
import enum
from typing import Awaitable, Callable

from langchain_core.language_models import BaseChatModel
from playwright.async_api import Page

from parsera.engine.model import GPT4oMiniModel
from parsera.engine.simple_extractor import (
ItemExtractor,
ListExtractor,
TabularExtractor,
)
from parsera.page import PageLoader, fetch_page_content
from parsera.page import PageLoader


class Parsera:
class ExtractorType(enum.Enum):
LIST = ListExtractor
TABULAR = TabularExtractor
ITEM = ItemExtractor
class ExtractorType(enum.Enum):
    """Selectable extraction strategies.

    Each member's ``value`` is the extractor class itself, so callers can
    instantiate the chosen strategy with ``member.value(...)``.
    """

    # Value: the ListExtractor class.
    LIST = ListExtractor
    # Value: the TabularExtractor class.
    TABULAR = TabularExtractor
    # Value: the ItemExtractor class.
    ITEM = ItemExtractor


class Parsera:
def __init__(
self,
model: BaseChatModel | None = None,
Expand Down Expand Up @@ -51,3 +54,91 @@ async def arun(
return await self._run(
url=url, elements=elements, proxy_settings=proxy_settings
)


class ParseraScript(Parsera):
    """Parsera variant that accepts custom Playwright scripts.

    A script is an async callable that takes a Playwright ``Page`` and
    returns a ``Page``. ``initial_script`` is forwarded to the loader when a
    session is created; ``playwright_script`` is forwarded to the loader on
    every page fetch.

    NOTE(review): ``self.loader``, ``self.model`` and ``self.extractor`` are
    presumably set by ``Parsera.__init__`` -- confirm against the base class.
    """

    def __init__(
        self,
        model: BaseChatModel | None = None,
        extractor: ExtractorType = ExtractorType.TABULAR,
        initial_script: Callable[[Page], Awaitable[Page]] | None = None,
        stealth: bool = True,
    ):
        """Configure the scraper.

        Args:
            model: Chat model passed through to the base class.
            extractor: Extraction strategy passed through to the base class.
            initial_script: Script handed to the loader when ``_run`` has to
                create a session.
            stealth: Stealth flag handed to the loader when ``_run`` has to
                create a session.
        """
        super().__init__(model=model, extractor=extractor)
        self.stealth = stealth
        self.initial_script = initial_script

    async def new_session(
        self,
        proxy_settings: dict | None = None,
        initial_script: Callable[[Page], Awaitable[Page]] | None = None,
        stealth: bool = True,
    ) -> None:
        """Ask the loader to create a browser session.

        The arguments are used exactly as given; the instance-level
        ``initial_script`` / ``stealth`` settings are NOT consulted here
        (``_run`` passes them in explicitly).

        Args:
            proxy_settings: Proxy configuration forwarded to the loader.
            initial_script: Script forwarded as the loader's
                ``playwright_script``.
            stealth: Stealth flag forwarded to the loader.
        """
        session_options = {
            "proxy_settings": proxy_settings,
            "playwright_script": initial_script,
            "stealth": stealth,
        }
        await self.loader.create_session(**session_options)

    async def extract_page(
        self,
        url: str,
        elements: dict,
        playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
    ):
        """Fetch ``url`` through the loader and run the extractor on it.

        Args:
            url: Address of the page to fetch.
            elements: Element specification handed to the extractor.
            playwright_script: Optional script forwarded to the loader's
                ``fetch_page``.

        Returns:
            Whatever the extractor's ``run()`` coroutine produces.
        """
        page_content = await self.loader.fetch_page(
            url=url, playwright_script=playwright_script
        )

        # The enum member's value is the extractor class to instantiate.
        extractor_cls = self.extractor.value
        extraction = extractor_cls(
            elements=elements, model=self.model, content=page_content
        )
        return await extraction.run()

    async def _run(
        self,
        url: str,
        elements: dict,
        proxy_settings: dict | None = None,
        playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
    ):
        """Create a session if the loader has none, then extract the page.

        ``proxy_settings`` is only used when a new session is created here;
        it is ignored when the loader already has a context.
        """
        session_missing = self.loader.context is None
        if session_missing:
            await self.new_session(
                proxy_settings=proxy_settings,
                initial_script=self.initial_script,
                stealth=self.stealth,
            )

        extracted = await self.extract_page(
            url=url, elements=elements, playwright_script=playwright_script
        )
        return extracted

    def run(
        self,
        url: str,
        elements: dict,
        proxy_settings: dict | None = None,
        playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
    ) -> dict:
        """Synchronous entry point: drive ``_run`` with ``asyncio.run``."""
        pending = self._run(
            url=url,
            elements=elements,
            proxy_settings=proxy_settings,
            playwright_script=playwright_script,
        )
        return asyncio.run(pending)

    async def arun(
        self,
        url: str,
        elements: dict,
        proxy_settings: dict | None = None,
        playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
    ) -> dict:
        """Asynchronous entry point: await ``_run`` and return its result."""
        outcome = await self._run(
            url=url,
            elements=elements,
            proxy_settings=proxy_settings,
            playwright_script=playwright_script,
        )
        return outcome
58 changes: 47 additions & 11 deletions parsera/page.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from typing import Literal, TypedDict
from typing import Awaitable, Callable, Literal, TypedDict

from playwright.async_api import (
Browser,
Expand All @@ -8,7 +8,7 @@
Playwright,
async_playwright,
)
from playwright_stealth import stealth_async
from playwright_stealth import StealthConfig, stealth_async


class ProxySettings(TypedDict, total=False):
Expand Down Expand Up @@ -41,27 +41,63 @@ async def new_browser(self) -> None:
else:
self.browser = await self.playwright.chromium.launch(headless=True)

async def load_content(
async def stealth(self, page: Page) -> Page:
    """Replace the current context with one that hides the headless user agent.

    Reads the user agent reported by ``page``, rewrites ``HeadlessChrome/``
    to ``Chrome/``, closes ``self.context``, opens a new context with the
    rewritten user agent, and applies ``stealth_async`` to a new page
    created in that context.

    Args:
        page: Page used to read the browser's user agent. If it belongs to
            ``self.context`` it must not be reused afterwards, because that
            context is closed here.

    Returns:
        The new page, created in the replacement context.
    """
    # Use the page that was passed in rather than reaching for self.page.
    user_agent = await page.evaluate("navigator.userAgent")
    user_agent = user_agent.replace("HeadlessChrome/", "Chrome/")
    await self.context.close()

    # NOTE(review): the replacement context is created with only
    # ``user_agent``; a proxy given to the context created in
    # ``create_session`` is not carried over -- confirm whether that is
    # intended.
    self.context = await self.browser.new_context(user_agent=user_agent)
    page = await self.context.new_page()
    # navigator_user_agent=False: presumably skips the stealth library's own
    # user-agent patch because the context already carries the rewritten one.
    await stealth_async(page, config=StealthConfig(navigator_user_agent=False))
    return page

async def create_session(
self,
url: str,
proxy_settings: ProxySettings | None = None,
new_browser: bool = True,
load_state: Literal[
"domcontentloaded", "load", "networkidle"
] = "domcontentloaded",
playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
stealth: bool = True,
) -> None:
if new_browser:
if not self.browser:
await self.new_browser()
self.context = await self.browser.new_context(proxy=proxy_settings)

self.page = await self.context.new_page()
await stealth_async(self.page)
if stealth:
self.page = await self.stealth(page=self.page)

if playwright_script:
self.page = await playwright_script(self.page)

async def fetch_page(
    self,
    url: str,
    load_state: Literal[
        "domcontentloaded", "load", "networkidle"
    ] = "domcontentloaded",
    playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
) -> str:
    """Navigate the session's page to ``url`` and return its content.

    Args:
        url: Address to navigate to.
        load_state: Load state to wait for after navigation.
        playwright_script: Optional async callable run on the page after the
            load state is reached; the page it returns replaces
            ``self.page``.

    Returns:
        The value of ``self.page.content()`` after any script has run.
    """
    # Navigate to the URL
    # Possible speed-up, currently disabled: abort image requests with
    # await page.route("**/*.{png,jpg,jpeg}", lambda route: route.abort())
    await self.page.goto(url)
    await self.page.wait_for_load_state(load_state)
    if playwright_script:
        self.page = await playwright_script(self.page)

    return await self.page.content()

async def load_content(
    self,
    url: str,
    proxy_settings: ProxySettings | None = None,
    load_state: Literal[
        "domcontentloaded", "load", "networkidle"
    ] = "domcontentloaded",
    playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
) -> str:
    """Create a session and fetch ``url`` in one call.

    Args:
        url: Address to navigate to.
        proxy_settings: Proxy configuration forwarded to ``create_session``.
        load_state: Load state forwarded to ``fetch_page``.
        playwright_script: Optional script forwarded to ``fetch_page``; it
            is not passed to ``create_session``.

    Returns:
        The page content returned by ``fetch_page``.
    """
    # NOTE(review): a session is created on every call, with
    # ``create_session``'s default ``stealth``/``playwright_script`` --
    # confirm that repeated calls are expected to open a fresh context.
    await self.create_session(proxy_settings=proxy_settings)
    return await self.fetch_page(
        url=url, load_state=load_state, playwright_script=playwright_script
    )

async def close(self) -> None:
if self.playwright:
await self.browser.close()
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "parsera"
version = "0.1.5"
version = "0.1.6"
description = "Lightweight library for scraping web-sites with LLMs"
authors = ["Mikhail Zanka <[email protected]>"]
license = "GPL-2.0-or-later"
Expand Down

0 comments on commit c3ad761

Please sign in to comment.