
Merge pull request #2 from reworkd/paginate
sleep -> timeout fixes
awtkns authored Mar 16, 2024
2 parents 6dfb397 + c0e218b commit d8e0d13
Showing 3 changed files with 34 additions and 18 deletions.
harambe/__init__.py (8 additions, 1 deletion)
@@ -2,4 +2,11 @@
 from .types import AsyncScraperType, ScrapeResult
 from .utils import PlaywrightUtils

-__all__ = ["ScrapeResult", "SDK", "PlaywrightUtils", "AsyncScraperType", "AsyncScraper", "PAGE_PDF_FILENAME"]
+__all__ = [
+    "ScrapeResult",
+    "SDK",
+    "PlaywrightUtils",
+    "AsyncScraperType",
+    "AsyncScraper",
+    "PAGE_PDF_FILENAME",
+]
harambe/core.py (14 additions, 13 deletions)
@@ -102,7 +102,7 @@ async def enqueue(self, *urls: URL, context: Optional[Context] = None) -> None:
     async def paginate(
         self,
         next_page: Callable[..., Awaitable[URL | ElementHandle | None]],
-        sleep: int = 0,
+        timeout: int = 5000,
     ) -> None:
         """
         Navigate to the next page of a listing.
@@ -117,7 +117,7 @@

             next_url = ""
             if isinstance(next_page, ElementHandle):
-                await next_page.click(timeout=1000)
+                await next_page.click(timeout=timeout)
                 next_url = self.page.url

             elif isinstance(next_page, str):
@@ -128,9 +128,9 @@
                 await self.page.goto(next_url)

             if next_url:
-                if sleep > 0:
-                    await asyncio.sleep(sleep)
-                await self._scraper(self, next_url, self._context)
+                await self._scraper(
+                    self, next_url, self._context
+                )  # TODO: eventually fix this to not be recursive
         except:  # noqa: E722
             return
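Taken together, these three hunks replace the old fixed post-navigation sleep (seconds) with a timeout (milliseconds, Playwright's convention) that is forwarded to ElementHandle.click; paginate still either clicks the returned handle or navigates to the returned URL. A minimal sketch of calling the new signature; the selector and scraper body are hypothetical, only paginate(next_page, timeout=...) comes from this commit:

# Hypothetical scraper using the new paginate signature; only
# sdk.paginate(..., timeout=...) is taken from this commit.
async def scrape(sdk, url, context):
    async def next_page():
        # May resolve to a URL, an ElementHandle, or None (end of listing).
        return await sdk.page.query_selector("a.next")

    ...  # scrape the current page here
    # Clicks the next-page handle with a 10s click timeout
    # instead of sleeping after navigation.
    await sdk.paginate(next_page, timeout=10_000)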

@@ -188,11 +188,16 @@ async def capture_pdf(
         Capture the current page as a pdf and then apply some download handling logic
         from the observer to transform to a usable URL
         """
-        await self.page.wait_for_timeout(1000)  # Allow for some extra time for the page to load
+        await self.page.wait_for_timeout(
+            1000
+        )  # Allow for some extra time for the page to load
         pdf_content = await self.page.pdf()
         file_name = PAGE_PDF_FILENAME
         res = await asyncio.gather(
-            *[o.on_download(self.page.url, file_name, pdf_content) for o in self._observers]
+            *[
+                o.on_download(self.page.url, file_name, pdf_content)
+                for o in self._observers
+            ]
         )
         return res[0]
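The reformatted asyncio.gather call keeps the existing fan-out: capture_pdf renders the page once, hands the same PDF bytes to every registered observer's on_download concurrently, and returns the first observer's result. A standalone sketch of that pattern (the function and argument names here are illustrative, not from the diff):

import asyncio

async def notify_observers(observers, url, filename, content):
    # One coroutine per observer, awaited concurrently; results come
    # back in registration order, and the caller keeps the first one.
    results = await asyncio.gather(
        *[o.on_download(url, filename, content) for o in observers]
    )
    return results[0]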

@@ -248,15 +253,11 @@ async def run(
             )
         except Exception as e:
             # TODO: Fix path for non Mr. Watkins
-            await ctx.tracing.stop(
-                path="/Users/awtkns/PycharmProjects/harambe-public/trace.zip"
-            )
+            await ctx.tracing.stop(path="trace.zip")
             await browser.close()
             raise e
         else:
-            await ctx.tracing.stop(
-                path="/Users/awtkns/PycharmProjects/harambe-public/trace.zip"
-            )
+            await ctx.tracing.stop(path="trace.zip")
             await browser.close()

     @staticmethod
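Both tracing.stop calls previously wrote the Playwright trace to a hard-coded absolute path on the author's machine (hence the "non Mr. Watkins" TODO); they now write trace.zip relative to the working directory. For context, a rough sketch of the tracing lifecycle around such a run; the start() flags and page actions are assumptions, only the relative stop path comes from this commit:

from playwright.async_api import async_playwright

async def traced_run():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        ctx = await browser.new_context()
        # Flags here are assumed; the commit only changes the stop() path.
        await ctx.tracing.start(screenshots=True, snapshots=True)
        try:
            page = await ctx.new_page()
            await page.goto("https://example.com")
        finally:
            await ctx.tracing.stop(path="trace.zip")  # relative, portable
            await browser.close()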
harambe/observer.py (12 additions, 4 deletions)
@@ -17,7 +17,9 @@ async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         raise NotImplementedError()

     @abstractmethod
-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         raise NotImplementedError()

@@ -29,7 +31,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         print(f"Enqueuing: {url} with context {context}")

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         print(f"Downloading file: {filename}")  # TODO: use logger
         return {
             "url": f"{download_url}/{quote(filename)}",
@@ -47,7 +51,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         self._tracker.save_data({"url": url, "context": context})

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         data = {
             "url": f"{download_url}/{quote(filename)}",
             "filename": filename,
@@ -68,7 +74,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         self._urls.append((url, context))

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         data = {
             "url": f"{download_url}/{quote(filename)}",
             "filename": filename,
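All four observers now share the same wrapped on_download signature, so a custom observer only has to match that contract. A minimal hypothetical implementation, mirroring the url/filename dict shape the observers above return (the class name and print logging are illustrative, and dict stands in for DownloadMeta):

from urllib.parse import quote

class PrintDownloadObserver:
    # Hypothetical observer matching the wrapped on_download contract.
    async def on_download(
        self, download_url: str, filename: str, content: bytes
    ) -> dict:
        print(f"Downloaded {len(content)} bytes as {filename}")
        # Mirrors the DownloadMeta-style dict built by the diff's observers.
        return {
            "url": f"{download_url}/{quote(filename)}",
            "filename": filename,
        }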
