Skip to content

Commit

Permalink
Improve page gathering
Browse files Browse the repository at this point in the history
Add iframes rendering support
  • Loading branch information
danyathecoder authored and raznem committed Nov 29, 2024
1 parent 778ade6 commit c12889a
Showing 1 changed file with 30 additions and 5 deletions.
35 changes: 30 additions & 5 deletions parsera/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def new_browser(self) -> None:
await self.browser.close()

# slow_mo mode is only there to deal with js rendering
self.browser = await self.playwright.firefox.launch(headless=True, slow_mo=3000)
self.browser = await self.playwright.firefox.launch(headless=True)

async def stealth(
self,
Expand Down Expand Up @@ -149,13 +149,38 @@ async def scroll_page(self, scrolls_limit: int = 0):

return final_content

async def get_iframe_html(self, frame):
    """Return the serialized HTML of ``frame``, or ``None`` if it cannot be read.

    Args:
        frame: Frame-like object exposing ``is_detached()`` and an awaitable
            ``evaluate(expression)``.

    Returns:
        The value of ``document.documentElement.outerHTML`` evaluated in the
        frame, or ``None`` when the frame is detached or reading it raises.
    """
    # Function-scope import so the block does not depend on the module's
    # import section.
    import logging

    try:
        if frame.is_detached():  # Skip detached frames
            return None
        return await frame.evaluate("document.documentElement.outerHTML")
    except Exception as e:
        # Deliberate best effort: one unreadable frame must not abort the
        # whole page capture. Report through logging rather than print() so
        # the caller's stdout is left alone.
        logging.getLogger(__name__).warning("Could not access iframe: %s", e)
        return None

async def get_full_html(self):
    """Return the main document's HTML followed by the HTML of each child frame.

    The main document is read from ``self.page``; every other frame is read
    through ``self.get_iframe_html``. Frames for which that helper returns
    ``None`` are left out.

    Returns:
        A single string: a ``<!-- Main Page HTML -->`` section, then one
        ``<!-- Iframe N HTML -->`` section per readable child frame, numbered
        from 1 in the order the frames are listed.
    """
    # Get main document HTML
    main_html = await self.page.evaluate("document.documentElement.outerHTML")

    # page.frames also lists the main frame; skip it so the main document is
    # not emitted a second time as "Iframe 1".
    main_frame = self.page.main_frame
    child_frames = [frame for frame in self.page.frames if frame is not main_frame]

    # Fetch all iframe HTMLs in parallel
    iframes_html = await asyncio.gather(
        *(self.get_iframe_html(frame) for frame in child_frames)
    )

    # Drop failed retrievals (None) before numbering, so indices stay contiguous.
    readable = [html for html in iframes_html if html is not None]

    # Assemble the sections once with join instead of repeated string +=.
    sections = [f"<!-- Main Page HTML -->\n{main_html}\n"]
    sections.extend(
        f"\n<!-- Iframe {idx} HTML -->\n{iframe_html}\n"
        for idx, iframe_html in enumerate(readable, start=1)
    )
    return "".join(sections)

async def fetch_page(
self,
url: str,
scrolls_limit: int = 0,
load_state: Literal[
"domcontentloaded", "load", "networkidle"
] = "domcontentloaded",
load_state: Literal["domcontentloaded", "load", "networkidle"] = "networkidle",
playwright_script: Callable[[Page], Awaitable[Page]] | None = None,
) -> None:
# Navigate to the URL
Expand All @@ -175,7 +200,7 @@ async def fetch_page(
if scrolls_limit > 0:
result = await self.scroll_page(scrolls_limit)
else:
result = await self.page.content()
result = await self.get_full_html()

return result

Expand Down

0 comments on commit c12889a

Please sign in to comment.