Skip to content

Commit

Permalink
🫡 Stop pagination observer
Browse files Browse the repository at this point in the history
  • Loading branch information
awtkns committed Mar 22, 2024
1 parent 01cc243 commit 1928e7e
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 9 deletions.
24 changes: 16 additions & 8 deletions harambe/observer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import hashlib
import json
from abc import abstractmethod
from typing import (
Any,
Expand Down Expand Up @@ -123,7 +125,7 @@ def files(self) -> List[Tuple[str, bytes]]:

class StopPaginationObserver(OutputObserver):
def __init__(self):
self._saved_data = set()
self._saved_data: set[bytes] = set()
self._paginator_called = False

async def on_save_data(self, data: dict[str, Any]):
Expand All @@ -142,14 +144,20 @@ def on_paginate(self, next_url: str) -> None:
self._paginator_called = True

def _add_data(self, data: Any):
d_set = frozenset(
(item for item in data.items() if not item[0].startswith("__"))
if isinstance(data, dict)
else data
)
if self._paginator_called and d_set in self._saved_data:
hash_value = self.compute_hash(data)

if self._paginator_called and hash_value in self._saved_data:
raise StopAsyncIteration()
self._saved_data.add(d_set)

self._saved_data.add(hash_value)

@staticmethod
def compute_hash(data: Any) -> bytes:
    """Return a stable 16-byte fingerprint of ``data`` for duplicate detection.

    When ``data`` is a dict, its top-level string keys starting with ``"__"``
    are dropped first, so two rows that differ only in such keys produce the
    same fingerprint. The remaining value is serialised to compact JSON with
    sorted keys (so dict key order does not matter) and hashed with MD5.

    Args:
        data: Any value that ``json.dumps`` can encode.

    Returns:
        The raw MD5 digest of the canonical JSON encoding.

    Raises:
        TypeError: If ``data`` contains a value ``json.dumps`` cannot encode.
    """
    if isinstance(data, dict):
        # Only string keys can carry the "__" prefix; checking the type first
        # keeps non-string keys (which json.dumps accepts) from raising
        # AttributeError on ``startswith``.
        data = {
            key: value
            for key, value in data.items()
            if not (isinstance(key, str) and key.startswith("__"))
        }

    canonical = json.dumps(data, separators=(",", ":"), sort_keys=True)
    # MD5 serves purely as a fast fingerprint here, not as a security
    # primitive; declaring that allows its use in restricted environments
    # where insecure algorithms are otherwise blocked.
    return hashlib.md5(canonical.encode(), usedforsecurity=False).digest()


class DownloadMeta(TypedDict):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harambe-sdk"
version = "0.9.0"
version = "0.9.1"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = ["awtkns <[email protected]>"]
readme = "README.md"
Expand Down
17 changes: 17 additions & 0 deletions tests/test_observers.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,23 @@ async def test_stop_pagination_observer_no_duplicate_data():
await observer.on_save_data({"foo": "bar"})
observer.on_paginate("https://example.com/page2")
await observer.on_save_data({"baz": "qux"})
await observer.on_save_data({"foo": [
"bar",
"baz",
]})


@pytest.mark.asyncio
async def test_ignore_underscore_attributes():
    """Rows that differ only in ``__``-prefixed keys count as duplicates."""
    stop_observer = StopPaginationObserver()

    seen_row = {"foo": "bar", "__url": "qux"}
    fresh_row = {"qux": "bar", "__url": "qux"}
    repeated_row = {"foo": "bar", "__url": "bad boy asim"}

    # Saved before any pagination has happened.
    await stop_observer.on_save_data(seen_row)

    stop_observer.on_paginate("https://example.com/page2")

    # Different visible content after paginating: expected to be accepted.
    await stop_observer.on_save_data(fresh_row)

    # Same visible content as the first row, only "__url" differs: the
    # observer is expected to signal that pagination should stop.
    with pytest.raises(StopAsyncIteration):
        await stop_observer.on_save_data(repeated_row)


@pytest.mark.asyncio
Expand Down

0 comments on commit 1928e7e

Please sign in to comment.