✨ Follow dynamically-built URLs (#146)
* ✨ Follow dynamically-built URLs

* Add tests and documentation

* Update OSX chromedriver version
roniemartinez authored May 5, 2022
1 parent 391771f commit 3032901
Showing 10 changed files with 164 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -31,7 +31,7 @@ jobs:
- os: macos-latest
pip-cache: ~/Library/Caches/pip
poetry-cache: ~/Library/Caches/pypoetry
chromedriver-version: 99.0.4844.51 # https://chromedriver.chromium.org/downloads
chromedriver-version: 100.0.4896.60 # https://chromedriver.chromium.org/downloads
- os: windows-latest
pip-cache: ~\AppData\Local\pip\Cache
poetry-cache: ~\AppData\Local\pypoetry\Cache
40 changes: 40 additions & 0 deletions docs/advanced/16_helper_functions.md
@@ -0,0 +1,40 @@
# Helper Functions

Here is a list of functions that can be useful for web scraping.

## `follow_url()`

This function allows adding dynamically created URLs to the list of URLs to be scraped.

=== "Python"

```python
from typing import Dict

from bs4 import BeautifulSoup

from dude import select, follow_url


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    follow_url(element["href"])
    return {"url": element["href"]}
```

## `get_current_url()`

This function allows access to the current URL being scraped.
It can be useful when used together with the `follow_url()` function.

=== "Python"

```python
from typing import Dict
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dude import select, follow_url, get_current_url


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    follow_url(urljoin(get_current_url(), element["href"]))
    return {"url": element["href"]}
```
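
Taken together, the two helpers are intended to be called inside a `@select` handler and then driven by the usual `run()` entry point. Below is a minimal sketch that is not part of this commit: `https://example.com` is a placeholder URL, and the `run()` keyword arguments mirror the ones exercised by the tests further down (`urls`, `parser`, `pages`).

```python
# Hedged sketch, not part of this commit: a runnable script combining
# follow_url() and get_current_url(). "https://example.com" is a placeholder,
# and the run() keyword arguments mirror the ones used in the tests below.
from typing import Dict
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dude import follow_url, get_current_url, run, select


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    # Resolve a possibly relative href against the page currently being
    # scraped, then queue it so it is visited on a later iteration.
    follow_url(urljoin(get_current_url(), element["href"]))
    return {"url": element["href"]}


if __name__ == "__main__":
    run(urls=["https://example.com"], parser="bs4", pages=2)
```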
14 changes: 13 additions & 1 deletion dude/__init__.py
@@ -3,7 +3,18 @@
from pathlib import Path
from typing import Any

from .context import group, post_setup, pre_setup, run, save, select, shutdown, start_requests, startup # noqa: F401
from .context import ( # noqa: F401
get_current_url,
group,
post_setup,
pre_setup,
run,
save,
select,
shutdown,
start_requests,
startup,
)
from .scraper import Scraper # noqa: F401

EXTRA_EXPORTS = []
@@ -25,6 +36,7 @@
"pre_setup",
"post_setup",
"start_requests",
"current_url",
] + EXTRA_EXPORTS


8 changes: 8 additions & 0 deletions dude/base.py
@@ -43,6 +43,7 @@ class ScraperBase(ABC):
"""

supports_sync = True
current_url = ""

def __init__(
self,
@@ -349,6 +350,12 @@ def wrapper(func: Callable) -> Callable:

return wrapper

def get_current_url(self) -> str:
return self.scraper.current_url if self.scraper else self.current_url

def follow_url(self, url: str) -> None:
self.scraper.urls.append(url) if self.scraper else self.urls.append(url)

def iter_urls(self) -> Iterator[str]:
try:
while True:
@@ -361,6 +368,7 @@ def iter_urls(self) -> Iterator[str]:
logger.info("Not allowed to crawl %s", url)
continue
time.sleep(crawl_delay)
self.current_url = url
yield url
except IndexError:
pass
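
The diff above also suggests why `follow_url()` can simply append to `self.urls`: `iter_urls()` keeps yielding from that same list inside a `try`/`except IndexError` loop, so URLs queued mid-crawl are still visited and an exhausted list ends the crawl. Below is a simplified, standalone sketch of that pattern, an assumption-level illustration rather than the library's actual code; it omits the robots.txt check and crawl delay shown above.

```python
# Standalone illustration (assumed simplification of ScraperBase.iter_urls):
# popping from the front of a shared list until IndexError means that items
# appended during iteration -- the follow_url() equivalent -- are still visited.
from typing import Iterator, List

urls: List[str] = ["https://example.com/"]


def iter_urls() -> Iterator[str]:
    try:
        while True:
            yield urls.pop(0)  # an empty list raises IndexError and ends the crawl
    except IndexError:
        pass


for current_url in iter_urls():
    print("scraping", current_url)
    if current_url.endswith("/"):
        urls.append(current_url + "page-2.html")  # queued mid-crawl, visited next
```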
2 changes: 2 additions & 0 deletions dude/context.py
@@ -14,3 +14,5 @@
pre_setup = _scraper.pre_setup
post_setup = _scraper.post_setup
start_requests = _scraper.start_requests
get_current_url = _scraper.get_current_url
follow_url = _scraper.follow_url
1 change: 1 addition & 0 deletions dude/optional/utils.py
@@ -60,6 +60,7 @@ def iter_requests(self) -> Iterable[Request]:
logger.info("Not allowed to crawl %s", str(request.url))
continue
time.sleep(crawl_delay)
self.current_url = str(request.url) # type: ignore
yield request
except IndexError:
pass
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -45,6 +45,7 @@ nav:
- Selenium Scraper: advanced/13_selenium.md
- Events: advanced/14_events.md
- "@start_requests": advanced/15_start_requests.md
- Helper Functions: advanced/16_helper_functions.md
- Supported Parser Backends:
- supported_parser_backends/index.md
- Migrating Your Web Scrapers to Dude: supported_parser_backends/migrating.md
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pydude"
version = "0.18.0"
version = "0.19.0"
repository = "https://github.com/roniemartinez/dude"
description = "dude uncomplicated data extraction"
authors = ["Ronie Martinez <[email protected]>"]
50 changes: 50 additions & 0 deletions tests/test_bs4.py
@@ -150,6 +150,24 @@ def save_to_database(data: Any, output: Optional[str]) -> bool:
return True


@pytest.fixture()
def bs4_follow_url(scraper_application: Scraper) -> None:
@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title")
def title(element: BeautifulSoup) -> Dict:
return {"title": element.get_text()}

@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title", url_match="example.com")
def url_dont_match(element: BeautifulSoup) -> Dict:
return {"title": element.get_text()}

@scraper_application.select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
scraper_application.follow_url(urljoin(scraper_application.get_current_url(), element["href"]))
return {"url": element["href"]}


def test_full_flow_bs4(
scraper_application: Scraper,
bs4_select: None,
@@ -171,6 +189,38 @@ def test_full_flow_bs4(
mock_database.save.assert_not_called()


def test_follow_url(
scraper_application: Scraper,
bs4_follow_url: None,
expected_data: List[Dict],
base_url: str,
scraper_save: None,
mock_database: mock.MagicMock,
mock_database_per_page: mock.MagicMock,
mock_httpx: Router,
) -> None:
assert scraper_application.has_async is False
assert len(scraper_application.rules) == 3

scraper_application.run(
urls=[base_url],
pages=2,
format="custom",
parser="bs4",
follow_urls=False,
save_per_page=True,
)

called_urls = [str(request.url) for request, _ in mock_httpx.calls]
assert urljoin(base_url, "/") in called_urls
assert urljoin(base_url, "url-1.html") in called_urls
assert urljoin(base_url, "url-2.html") in called_urls
assert urljoin(base_url, "url-3.html") in called_urls

mock_database_per_page.save.assert_called_with(expected_data)
mock_database.save.assert_not_called()


def test_bs4_httpx_exception(
scraper_application: Scraper,
bs4_select: None,
47 changes: 47 additions & 0 deletions tests/test_playwright_sync.py
@@ -1,6 +1,7 @@
import json
from typing import Any, Dict, List, Optional
from unittest import mock
from urllib.parse import urljoin

import pytest
import yaml
@@ -131,6 +132,24 @@ def save_to_database(data: Any, output: Optional[str]) -> bool:
return True


@pytest.fixture()
def playwright_follow_url(scraper_application: Scraper) -> None:
@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title")
def title(element: sync_api.ElementHandle) -> Dict:
return {"title": element.text_content()}

@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title", url_match="example.com")
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
return {"title": element.text_content()}

@scraper_application.select(css=".url", group_css=".custom-group")
def url(element: sync_api.ElementHandle) -> Dict:
scraper_application.follow_url(urljoin(scraper_application.get_current_url(), element.get_attribute("href")))
return {"url": element.get_attribute("href")}


@pytest.mark.parametrize(
"browser_type",
(
@@ -168,6 +187,34 @@ def test_full_flow(
mock_database.close.assert_called_once()


def test_follow_url(
scraper_application: Scraper,
playwright_follow_url: None,
playwright_setup: None,
playwright_navigate: None,
playwright_startup: None,
playwright_pre_setup: None,
playwright_post_setup: None,
playwright_shutdown: None,
scraper_save: None,
expected_browser_data: List[Dict],
file_url: str,
mock_database: mock.MagicMock,
mock_database_per_page: mock.MagicMock,
) -> None:
assert scraper_application.has_async is False
assert len(scraper_application.rules) == 5

scraper_application.run(
urls=[file_url], pages=2, format="custom", parser="playwright", follow_urls=False, save_per_page=True
)

mock_database.setup.assert_called_once()
mock_database_per_page.save.assert_called_with(expected_browser_data)
mock_database.save.assert_not_called()
mock_database.close.assert_called_once()


def test_full_flow_xpath(
scraper_application: Scraper,
playwright_xpath: None,
