✨ Follow dynamically-built URLs (#146)
* ✨ Follow dynamically-built URLs

* Add tests and documentation

* Update OSX chromedriver version
roniemartinez authored May 5, 2022
1 parent 391771f commit 3032901
Showing 10 changed files with 164 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -31,7 +31,7 @@ jobs:
- os: macos-latest
pip-cache: ~/Library/Caches/pip
poetry-cache: ~/Library/Caches/pypoetry
chromedriver-version: 99.0.4844.51 # https://chromedriver.chromium.org/downloads
chromedriver-version: 100.0.4896.60 # https://chromedriver.chromium.org/downloads
- os: windows-latest
pip-cache: ~\AppData\Local\pip\Cache
poetry-cache: ~\AppData\Local\pypoetry\Cache
40 changes: 40 additions & 0 deletions docs/advanced/16_helper_functions.md
@@ -0,0 +1,40 @@
# Helper Functions

Here is a list of functions that can be useful for web scraping.

## `follow_url()`

This function allows adding dynamically created URLs to the list of URLs to be scraped.

=== "Python"

```python
from typing import Dict

from bs4 import BeautifulSoup

from dude import select, follow_url


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    follow_url(element["href"])
    return {"url": element["href"]}
```

## `get_current_url()`

This function allows access to the current URL being scraped.
It can be useful when used together with the `follow_url()` function.

=== "Python"

```python
from typing import Dict
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dude import select, follow_url, get_current_url


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    follow_url(urljoin(get_current_url(), element["href"]))
    return {"url": element["href"]}
```
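
Taken together, the two helpers are intended to be called inside a `@select` handler and then driven by the usual `run()` entry point. Below is a minimal sketch that is not part of this commit: `https://example.com` is a placeholder URL, and the `run()` keyword arguments mirror the ones exercised by the tests further down (`urls`, `parser`, `pages`).

```python
# Hedged sketch, not part of this commit: a runnable script combining
# follow_url() and get_current_url(). "https://example.com" is a placeholder,
# and the run() keyword arguments mirror the ones used in the tests below.
from typing import Dict
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dude import follow_url, get_current_url, run, select


@select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
    # Resolve a possibly relative href against the page currently being
    # scraped, then queue it so it is visited on a later iteration.
    follow_url(urljoin(get_current_url(), element["href"]))
    return {"url": element["href"]}


if __name__ == "__main__":
    run(urls=["https://example.com"], parser="bs4", pages=2)
```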
14 changes: 13 additions & 1 deletion dude/__init__.py
@@ -3,7 +3,18 @@
from pathlib import Path
from typing import Any

from .context import group, post_setup, pre_setup, run, save, select, shutdown, start_requests, startup # noqa: F401
from .context import ( # noqa: F401
get_current_url,
group,
post_setup,
pre_setup,
run,
save,
select,
shutdown,
start_requests,
startup,
)
from .scraper import Scraper # noqa: F401

EXTRA_EXPORTS = []
@@ -25,6 +36,7 @@
"pre_setup",
"post_setup",
"start_requests",
"current_url",
] + EXTRA_EXPORTS


8 changes: 8 additions & 0 deletions dude/base.py
@@ -43,6 +43,7 @@ class ScraperBase(ABC):
"""

supports_sync = True
current_url = ""

def __init__(
self,
@@ -349,6 +350,12 @@ def wrapper(func: Callable) -> Callable:

return wrapper

def get_current_url(self) -> str:
return self.scraper.current_url if self.scraper else self.current_url

def follow_url(self, url: str) -> None:
self.scraper.urls.append(url) if self.scraper else self.urls.append(url)

def iter_urls(self) -> Iterator[str]:
try:
while True:
@@ -361,6 +368,7 @@ def iter_urls(self) -> Iterator[str]:
logger.info("Not allowed to crawl %s", url)
continue
time.sleep(crawl_delay)
self.current_url = url
yield url
except IndexError:
pass
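
The diff above also suggests why `follow_url()` can simply append to `self.urls`: `iter_urls()` keeps yielding from that same list inside a `try`/`except IndexError` loop, so URLs queued mid-crawl are still visited and an exhausted list ends the crawl. Below is a simplified, standalone sketch of that pattern, an assumption-level illustration rather than the library's actual code; it omits the robots.txt check and crawl delay shown above.

```python
# Standalone illustration (assumed simplification of ScraperBase.iter_urls):
# popping from the front of a shared list until IndexError means that items
# appended during iteration -- the follow_url() equivalent -- are still visited.
from typing import Iterator, List

urls: List[str] = ["https://example.com/"]


def iter_urls() -> Iterator[str]:
    try:
        while True:
            yield urls.pop(0)  # an empty list raises IndexError and ends the crawl
    except IndexError:
        pass


for current_url in iter_urls():
    print("scraping", current_url)
    if current_url.endswith("/"):
        urls.append(current_url + "page-2.html")  # queued mid-crawl, visited next
```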
2 changes: 2 additions & 0 deletions dude/context.py
@@ -14,3 +14,5 @@
pre_setup = _scraper.pre_setup
post_setup = _scraper.post_setup
start_requests = _scraper.start_requests
get_current_url = _scraper.get_current_url
follow_url = _scraper.follow_url
1 change: 1 addition & 0 deletions dude/optional/utils.py
@@ -60,6 +60,7 @@ def iter_requests(self) -> Iterable[Request]:
logger.info("Not allowed to crawl %s", str(request.url))
continue
time.sleep(crawl_delay)
self.current_url = str(request.url) # type: ignore
yield request
except IndexError:
pass
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -45,6 +45,7 @@ nav:
- Selenium Scraper: advanced/13_selenium.md
- Events: advanced/14_events.md
- "@start_requests": advanced/15_start_requests.md
- Helper Functions: advanced/16_helper_functions.md
- Supported Parser Backends:
- supported_parser_backends/index.md
- Migrating Your Web Scrapers to Dude: supported_parser_backends/migrating.md
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pydude"
version = "0.18.0"
version = "0.19.0"
repository = "https://github.com/roniemartinez/dude"
description = "dude uncomplicated data extraction"
authors = ["Ronie Martinez <[email protected]>"]
50 changes: 50 additions & 0 deletions tests/test_bs4.py
@@ -150,6 +150,24 @@ def save_to_database(data: Any, output: Optional[str]) -> bool:
return True


@pytest.fixture()
def bs4_follow_url(scraper_application: Scraper) -> None:
@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title")
def title(element: BeautifulSoup) -> Dict:
return {"title": element.get_text()}

@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title", url_match="example.com")
def url_dont_match(element: BeautifulSoup) -> Dict:
return {"title": element.get_text()}

@scraper_application.select(css=".url", group_css=".custom-group")
def url(element: BeautifulSoup) -> Dict:
scraper_application.follow_url(urljoin(scraper_application.get_current_url(), element["href"]))
return {"url": element["href"]}


def test_full_flow_bs4(
scraper_application: Scraper,
bs4_select: None,
@@ -171,6 +189,38 @@ def test_full_flow_bs4(
mock_database.save.assert_not_called()


def test_follow_url(
scraper_application: Scraper,
bs4_follow_url: None,
expected_data: List[Dict],
base_url: str,
scraper_save: None,
mock_database: mock.MagicMock,
mock_database_per_page: mock.MagicMock,
mock_httpx: Router,
) -> None:
assert scraper_application.has_async is False
assert len(scraper_application.rules) == 3

scraper_application.run(
urls=[base_url],
pages=2,
format="custom",
parser="bs4",
follow_urls=False,
save_per_page=True,
)

called_urls = [str(request.url) for request, _ in mock_httpx.calls]
assert urljoin(base_url, "/") in called_urls
assert urljoin(base_url, "url-1.html") in called_urls
assert urljoin(base_url, "url-2.html") in called_urls
assert urljoin(base_url, "url-3.html") in called_urls

mock_database_per_page.save.assert_called_with(expected_data)
mock_database.save.assert_not_called()


def test_bs4_httpx_exception(
scraper_application: Scraper,
bs4_select: None,
47 changes: 47 additions & 0 deletions tests/test_playwright_sync.py
@@ -1,6 +1,7 @@
import json
from typing import Any, Dict, List, Optional
from unittest import mock
from urllib.parse import urljoin

import pytest
import yaml
@@ -131,6 +132,24 @@ def save_to_database(data: Any, output: Optional[str]) -> bool:
return True


@pytest.fixture()
def playwright_follow_url(scraper_application: Scraper) -> None:
@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title")
def title(element: sync_api.ElementHandle) -> Dict:
return {"title": element.text_content()}

@scraper_application.group(css=".custom-group")
@scraper_application.select(css=".title", url_match="example.com")
def url_dont_match(element: sync_api.ElementHandle) -> Dict:
return {"title": element.text_content()}

@scraper_application.select(css=".url", group_css=".custom-group")
def url(element: sync_api.ElementHandle) -> Dict:
scraper_application.follow_url(urljoin(scraper_application.get_current_url(), element.get_attribute("href")))
return {"url": element.get_attribute("href")}


@pytest.mark.parametrize(
"browser_type",
(
@@ -168,6 +187,34 @@ def test_full_flow(
mock_database.close.assert_called_once()


def test_follow_url(
scraper_application: Scraper,
playwright_follow_url: None,
playwright_setup: None,
playwright_navigate: None,
playwright_startup: None,
playwright_pre_setup: None,
playwright_post_setup: None,
playwright_shutdown: None,
scraper_save: None,
expected_browser_data: List[Dict],
file_url: str,
mock_database: mock.MagicMock,
mock_database_per_page: mock.MagicMock,
) -> None:
assert scraper_application.has_async is False
assert len(scraper_application.rules) == 5

scraper_application.run(
urls=[file_url], pages=2, format="custom", parser="playwright", follow_urls=False, save_per_page=True
)

mock_database.setup.assert_called_once()
mock_database_per_page.save.assert_called_with(expected_browser_data)
mock_database.save.assert_not_called()
mock_database.close.assert_called_once()


def test_full_flow_xpath(
scraper_application: Scraper,
playwright_xpath: None,
