
Merge pull request #2 from reworkd/paginate
sleep -> timeout fixes
awtkns authored Mar 16, 2024
2 parents 6dfb397 + c0e218b commit d8e0d13
Showing 3 changed files with 34 additions and 18 deletions.
harambe/__init__.py (8 additions, 1 deletion)
@@ -2,4 +2,11 @@
 from .types import AsyncScraperType, ScrapeResult
 from .utils import PlaywrightUtils

-__all__ = ["ScrapeResult", "SDK", "PlaywrightUtils", "AsyncScraperType", "AsyncScraper", "PAGE_PDF_FILENAME"]
+__all__ = [
+    "ScrapeResult",
+    "SDK",
+    "PlaywrightUtils",
+    "AsyncScraperType",
+    "AsyncScraper",
+    "PAGE_PDF_FILENAME",
+]
harambe/core.py (14 additions, 13 deletions)
@@ -102,7 +102,7 @@ async def enqueue(self, *urls: URL, context: Optional[Context] = None) -> None:
     async def paginate(
         self,
         next_page: Callable[..., Awaitable[URL | ElementHandle | None]],
-        sleep: int = 0,
+        timeout: int = 5000,
     ) -> None:
         """
         Navigate to the next page of a listing.
@@ -117,7 +117,7 @@

             next_url = ""
             if isinstance(next_page, ElementHandle):
-                await next_page.click(timeout=1000)
+                await next_page.click(timeout=timeout)
                 next_url = self.page.url

             elif isinstance(next_page, str):
@@ -128,9 +128,9 @@
                 await self.page.goto(next_url)

             if next_url:
-                if sleep > 0:
-                    await asyncio.sleep(sleep)
-                await self._scraper(self, next_url, self._context)
+                await self._scraper(
+                    self, next_url, self._context
+                )  # TODO: eventually fix this to not be recursive
         except:  # noqa: E722
             return
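Taken together, these three hunks replace the old fixed post-navigation sleep (seconds) with a timeout (milliseconds, Playwright's convention) that is forwarded to ElementHandle.click; paginate still either clicks the returned handle or navigates to the returned URL. A minimal sketch of calling the new signature; the selector and scraper body are hypothetical, only paginate(next_page, timeout=...) comes from this commit:

# Hypothetical scraper using the new paginate signature; only
# sdk.paginate(..., timeout=...) is taken from this commit.
async def scrape(sdk, url, context):
    async def next_page():
        # May resolve to a URL, an ElementHandle, or None (end of listing).
        return await sdk.page.query_selector("a.next")

    ...  # scrape the current page here
    # Clicks the next-page handle with a 10s click timeout
    # instead of sleeping after navigation.
    await sdk.paginate(next_page, timeout=10_000)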

@@ -188,11 +188,16 @@ async def capture_pdf(
         Capture the current page as a pdf and then apply some download handling logic
         from the observer to transform to a usable URL
         """
-        await self.page.wait_for_timeout(1000)  # Allow for some extra time for the page to load
+        await self.page.wait_for_timeout(
+            1000
+        )  # Allow for some extra time for the page to load
         pdf_content = await self.page.pdf()
         file_name = PAGE_PDF_FILENAME
         res = await asyncio.gather(
-            *[o.on_download(self.page.url, file_name, pdf_content) for o in self._observers]
+            *[
+                o.on_download(self.page.url, file_name, pdf_content)
+                for o in self._observers
+            ]
         )
         return res[0]
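The reformatted asyncio.gather call keeps the existing fan-out: capture_pdf renders the page once, hands the same PDF bytes to every registered observer's on_download concurrently, and returns the first observer's result. A standalone sketch of that pattern (the function and argument names here are illustrative, not from the diff):

import asyncio

async def notify_observers(observers, url, filename, content):
    # One coroutine per observer, awaited concurrently; results come
    # back in registration order, and the caller keeps the first one.
    results = await asyncio.gather(
        *[o.on_download(url, filename, content) for o in observers]
    )
    return results[0]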

@@ -248,15 +253,11 @@ async def run(
             )
         except Exception as e:
             # TODO: Fix path for non Mr. Watkins
-            await ctx.tracing.stop(
-                path="/Users/awtkns/PycharmProjects/harambe-public/trace.zip"
-            )
+            await ctx.tracing.stop(path="trace.zip")
             await browser.close()
             raise e
         else:
-            await ctx.tracing.stop(
-                path="/Users/awtkns/PycharmProjects/harambe-public/trace.zip"
-            )
+            await ctx.tracing.stop(path="trace.zip")
             await browser.close()

     @staticmethod
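Both tracing.stop calls previously wrote the Playwright trace to a hard-coded absolute path on the author's machine (hence the "non Mr. Watkins" TODO); they now write trace.zip relative to the working directory. For context, a rough sketch of the tracing lifecycle around such a run; the start() flags and page actions are assumptions, only the relative stop path comes from this commit:

from playwright.async_api import async_playwright

async def traced_run():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        ctx = await browser.new_context()
        # Flags here are assumed; the commit only changes the stop() path.
        await ctx.tracing.start(screenshots=True, snapshots=True)
        try:
            page = await ctx.new_page()
            await page.goto("https://example.com")
        finally:
            await ctx.tracing.stop(path="trace.zip")  # relative, portable
            await browser.close()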
harambe/observer.py (12 additions, 4 deletions)
@@ -17,7 +17,9 @@ async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         raise NotImplementedError()

     @abstractmethod
-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         raise NotImplementedError()

@@ -29,7 +31,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         print(f"Enqueuing: {url} with context {context}")

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         print(f"Downloading file: {filename}")  # TODO: use logger
         return {
             "url": f"{download_url}/{quote(filename)}",
@@ -47,7 +51,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         self._tracker.save_data({"url": url, "context": context})

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         data = {
             "url": f"{download_url}/{quote(filename)}",
             "filename": filename,
@@ -68,7 +74,9 @@ async def on_save_data(self, data: Dict[str, Any]):
     async def on_queue_url(self, url: URL, context: Dict[str, Any]) -> None:
         self._urls.append((url, context))

-    async def on_download(self, download_url: str, filename: str, content: bytes) -> "DownloadMeta":
+    async def on_download(
+        self, download_url: str, filename: str, content: bytes
+    ) -> "DownloadMeta":
         data = {
             "url": f"{download_url}/{quote(filename)}",
             "filename": filename,
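All four observers now share the same wrapped on_download signature, so a custom observer only has to match that contract. A minimal hypothetical implementation, mirroring the url/filename dict shape the observers above return (the class name and print logging are illustrative, and dict stands in for DownloadMeta):

from urllib.parse import quote

class PrintDownloadObserver:
    # Hypothetical observer matching the wrapped on_download contract.
    async def on_download(
        self, download_url: str, filename: str, content: bytes
    ) -> dict:
        print(f"Downloaded {len(content)} bytes as {filename}")
        # Mirrors the DownloadMeta-style dict built by the diff's observers.
        return {
            "url": f"{download_url}/{quote(filename)}",
            "filename": filename,
        }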
