Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
awtkns committed Jun 28, 2024
2 parents 2136890 + b27535f commit 3faab54
Show file tree
Hide file tree
Showing 10 changed files with 304 additions and 41 deletions.
5 changes: 2 additions & 3 deletions harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ class AsyncScraper(Protocol):
Note that scrapers in harambe should be functions, not classes.
"""

async def scrape(self, sdk: "SDK", url: URL, context: Context) -> None:
...
async def scrape(self, sdk: "SDK", url: URL, context: Context) -> None: ...


class SDK:
Expand Down Expand Up @@ -105,7 +104,7 @@ async def save_data(self, *data: ScrapeResult) -> None:
url = self.page.url
for d in data:
if self._validator is not None:
self._validator.validate(d, base_url=self.page.url)
d = self._validator.validate(d, base_url=self.page.url)
d["__url"] = url
await self._notify_observers("on_save_data", d)

Expand Down
19 changes: 11 additions & 8 deletions harambe/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class SchemaParser(ABC):
"""

@abstractmethod
def validate(self, data: Dict[str, Any]) -> None:
def validate(self, data: Dict[str, Any], base_url: URL) -> None:
pass


Expand All @@ -39,21 +39,24 @@ class PydanticSchemaParser(SchemaParser):

def __init__(self, schema: Schema):
self.schema = schema
self.base_url = None
self.model = None
self.field_types = None

def validate(self, data: Dict[str, Any], base_url: URL) -> Dict[str, Any]:
# Set these values here for convenience to avoid passing them around. A bit hacky
self.field_types = self._get_field_types(base_url)

def validate(self, data: Dict[str, Any], base_url: URL) -> None:
self.base_url = base_url
self.field_types = self._get_field_types()
self.model = self._schema_to_pydantic_model(self.schema)

try:
self.model(**data)
return self.model(**data).dict()
except ValidationError as validation_error:
raise SchemaValidationError(
data=data, schema=self.schema, message=validation_error
)

def _get_field_types(self) -> Dict[str, Type]:
@staticmethod
def _get_field_types(base_url: str) -> Dict[str, Type]:
return {
"string": str,
"str": str,
Expand All @@ -70,7 +73,7 @@ def _get_field_types(self) -> Dict[str, Type]:
OBJECT_TYPE: Dict[str, Any],
"datetime": ParserTypeDate(),
"phone_number": ParserTypePhoneNumber(),
"url": ParserTypeUrl(base_url=self.base_url),
"url": ParserTypeUrl(base_url=base_url),
}

def _items_schema_to_python_type(
Expand Down
56 changes: 40 additions & 16 deletions harambe/parser/type_date.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,53 @@
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated
import re
from datetime import datetime

import dateparser
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated


class ParserTypeDate:
def __new__(self):
return Annotated[str, AfterValidator(self.validate_type)]
def __new__(cls):
return Annotated[str, AfterValidator(cls.validate_type)]

def validate_type(date: str):
if not isinstance(date, str):
raise ValueError("Wrong input type")
@staticmethod
def validate_type(date: str) -> str:
# Cast to string in case the date is a datetime float/number
date = str(date)

# Trim whitespaces
date = date.strip()

# Make sure it's not empty string
# Make sure it's not an empty string
if len(date) == 0:
raise ValueError("Empty input")

# Attempt to parse date string
try:
dateparser.parse(date)
return date
except ValueError:
pass

raise ValueError(f"Unable to parse input as date: {date}")
# Attempt to parse date string using dateparser
parsed_date = dateparser.parse(date)

if parsed_date is None:
# Remove timezone abbreviation in parentheses if present
date = re.sub(r"\s*\(.*\)$", "", date).strip()

# List of datetime formats to try
datetime_formats = [
"%m/%d/%Y %I:%M:%S %p", # 4/30/2024 09:00:02 AM
"%Y-%m-%dT%H:%M:%S", # 2024-04-30T09:00:02
"%Y-%m-%d %H:%M:%S", # 2024-04-30 09:00:02
"%B %d, %Y - %I:%M%p", # May 14, 2024 - 2:00pm
"%m/%d/%Y", # 4/30/2024
]

# Attempt to parse using datetime with specific formats
for date_format in datetime_formats:
try:
parsed_date = datetime.strptime(date, date_format)
break
except ValueError:
continue

if parsed_date is None:
raise ValueError(f"Unable to parse input as date: {date}")

# Return the date in ISO 8601 format
return parsed_date.isoformat()
24 changes: 21 additions & 3 deletions harambe/parser/type_phone_number.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from pydantic.functional_validators import AfterValidator
import re

import phonenumbers
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

phone_number_formats = [
Expand All @@ -12,12 +14,28 @@


class ParserTypePhoneNumber:
def __new__(self):
return Annotated[str, AfterValidator(self.validate_type)]
def __new__(cls):
return Annotated[str, AfterValidator(cls.validate_type)]

@staticmethod
def validate_type(number: str) -> str:
# Trim whitespaces
formatted_number = number.strip()

# First, try using the phonenumbers library
try:
phone_number = phonenumbers.parse(
formatted_number, None
) # 'None' implies no specific region
if phonenumbers.is_valid_number(phone_number):
# Return the phone number in international format
return phonenumbers.format_number(
phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL
)
except phonenumbers.phonenumberutil.NumberParseException:
pass

# If phonenumbers library fails, fall back to regex validation
# Remove plus sign
formatted_number = number.replace("+", "")
# Attempt to parse phone number
Expand Down
15 changes: 8 additions & 7 deletions harambe/parser/type_url.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pydantic.functional_validators import AfterValidator
from typing import Optional
from typing_extensions import Annotated
import urllib.parse
from urllib.parse import urljoin, urlparse

from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

from harambe.types import URL

allowed_url_schemes = [
Expand All @@ -17,9 +17,10 @@


class ParserTypeUrl:
def __new__(self, base_url: Optional[URL] = None):
return Annotated[URL, AfterValidator(self.validate_type(base_url))]
def __new__(cls, base_url: Optional[URL] = None):
return Annotated[str, AfterValidator(cls.validate_type(base_url))]

@staticmethod
def validate_type(base_url: Optional[URL]):
def _validate_type(url: URL) -> str:
# Transform relative URLs into absolute using base_url
Expand All @@ -29,13 +30,13 @@ def _validate_type(url: URL) -> str:

# Parse the URL
try:
parsed_url = urllib.parse.urlparse(url)
parsed_url = urlparse(url)
except ValueError as e:
raise ValueError(f"Unable to parse URL: {url}", e)

# Check if the scheme is allowed
if parsed_url.scheme not in allowed_url_schemes:
raise ValueError(f"Invalid URL: {url}")
raise ValueError(f"Invalid URL scheme: {url}")

return url

Expand Down
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "harambe-sdk"
version = "0.13.6"
version = "0.14.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = ["awtkns <[email protected]>"]
readme = "README.md"
Expand All @@ -20,6 +20,7 @@ requests = "^2.32.3"
playwright-stealth = "^1.0.6" # TODO: self host this package
aiohttp = "^3.9.5"
email-validator = "^2.2.0"
phonenumbers = "^8.13.39"

[tool.poetry.group.dev.dependencies]
ruff = "^0.4.10"
Expand Down
57 changes: 57 additions & 0 deletions tests/parser/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,60 @@
"description": "Purely to cause error in the test",
},
}

# Test schema: a single "event" object with a plain-string name and a
# "datetime"-typed date field.
datetime_schema = {
    "event": {
        "type": "object",
        "properties": {
            "name": {
                "type": "string",
                "description": "The name of the event",
            },
            "date": {
                "type": "datetime",
                "description": "The date of the event",
            },
        },
    },
}

# Test schema: a single "contact" object with a plain-string name and a
# "phone_number"-typed phone field.
phone_number_schema = {
    "contact": {
        "type": "object",
        "properties": {
            "name": {
                "type": "string",
                "description": "The name of the contact",
            },
            "phone": {
                "type": "phone_number",
                "description": "The phone number",
            },
        },
    },
}

# Test schema: a single "resource" object with a plain-string name and a
# "url"-typed link field.
url_schema = {
    "resource": {
        "type": "object",
        "properties": {
            "name": {
                "type": "string",
                "description": "The name of the resource",
            },
            "link": {
                "type": "url",
                "description": "A link to the resource",
            },
        },
    },
}

# Test schema: a "profile" object that combines a plain string with one field
# of each special type ("phone_number", "datetime", "url").
object_with_nested_types_schema = {
    "profile": {
        "type": "object",
        "properties": {
            "user": {
                "type": "string",
                "description": "Username",
            },
            "contact": {
                "type": "phone_number",
                "description": "Contact number",
            },
            "event_date": {
                "type": "datetime",
                "description": "Event date",
            },
            "website": {
                "type": "url",
                "description": "Website URL",
            },
        },
    },
}

# Test schema: an "events" array whose items are objects; each item carries a
# string name plus arrays of "datetime", "phone_number" and "url" values.
list_with_nested_types_schema = {
    "events": {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "dates": {
                    "type": "array",
                    "items": {"type": "datetime"},
                },
                "contacts": {
                    "type": "array",
                    "items": {"type": "phone_number"},
                },
                "links": {
                    "type": "array",
                    "items": {"type": "url"},
                },
            },
        },
    },
}
Loading

0 comments on commit 3faab54

Please sign in to comment.