Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🩹 Fix parsers and ensure parsers actually update the returned data #33

Merged
merged 10 commits into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion harambe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ async def save_data(self, *data: ScrapeResult) -> None:
url = self.page.url
for d in data:
if self._validator is not None:
self._validator.validate(d, base_url=self.page.url)
d = self._validator.validate(d, base_url=self.page.url)
d["__url"] = url
await self._notify_observers("on_save_data", d)

Expand Down
19 changes: 11 additions & 8 deletions harambe/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class SchemaParser(ABC):
"""

@abstractmethod
def validate(self, data: Dict[str, Any]) -> None:
def validate(self, data: Dict[str, Any], base_url: URL) -> None:
pass


Expand All @@ -39,21 +39,24 @@ class PydanticSchemaParser(SchemaParser):

def __init__(self, schema: Schema):
self.schema = schema
self.base_url = None
self.model = None
self.field_types = None

def validate(self, data: Dict[str, Any], base_url: URL) -> Dict[str, Any]:
# Set these values here for convenience to avoid passing them around. A bit hacky
self.field_types = self._get_field_types(base_url)

def validate(self, data: Dict[str, Any], base_url: URL) -> None:
self.base_url = base_url
self.field_types = self._get_field_types()
self.model = self._schema_to_pydantic_model(self.schema)

try:
self.model(**data)
return self.model(**data).dict()
except ValidationError as validation_error:
raise SchemaValidationError(
data=data, schema=self.schema, message=validation_error
)

def _get_field_types(self) -> Dict[str, Type]:
@staticmethod
def _get_field_types(base_url: str) -> Dict[str, Type]:
return {
"string": str,
"str": str,
Expand All @@ -70,7 +73,7 @@ def _get_field_types(self) -> Dict[str, Type]:
OBJECT_TYPE: Dict[str, Any],
"datetime": ParserTypeDate(),
"phone_number": ParserTypePhoneNumber(),
"url": ParserTypeUrl(base_url=self.base_url),
"url": ParserTypeUrl(base_url=base_url),
}

def _items_schema_to_python_type(
Expand Down
56 changes: 40 additions & 16 deletions harambe/parser/type_date.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,53 @@
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated
import re
from datetime import datetime

import dateparser
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated


class ParserTypeDate:
def __new__(self):
return Annotated[str, AfterValidator(self.validate_type)]
def __new__(cls):
return Annotated[str, AfterValidator(cls.validate_type)]

def validate_type(date: str):
if not isinstance(date, str):
raise ValueError("Wrong input type")
@staticmethod
def validate_type(date: str) -> str:
# Cast to string in case the date is a datetime float/number
date = str(date)

# Trim whitespace
date = date.strip()

# Make sure it's not empty string
# Make sure it's not an empty string
if len(date) == 0:
raise ValueError("Empty input")

# Attempt to parse date string
try:
dateparser.parse(date)
return date
except ValueError:
pass

raise ValueError(f"Unable to parse input as date: {date}")
# Attempt to parse date string using dateparser
parsed_date = dateparser.parse(date)

if parsed_date is None:
# Remove timezone abbreviation in parentheses if present
date = re.sub(r'\s*\(.*\)$', '', date).strip()

# List of datetime formats to try
datetime_formats = [
'%m/%d/%Y %I:%M:%S %p', # 4/30/2024 09:00:02 AM
'%Y-%m-%dT%H:%M:%S', # 2024-04-30T09:00:02
'%Y-%m-%d %H:%M:%S', # 2024-04-30 09:00:02
'%B %d, %Y - %I:%M%p', # May 14, 2024 - 2:00pm
'%m/%d/%Y', # 4/30/2024
]

# Attempt to parse using datetime with specific formats
for date_format in datetime_formats:
try:
parsed_date = datetime.strptime(date, date_format)
break
except ValueError:
continue

if parsed_date is None:
raise ValueError(f"Unable to parse input as date: {date}")

# Return the date in ISO 8601 format
return parsed_date.isoformat()
22 changes: 18 additions & 4 deletions harambe/parser/type_phone_number.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,37 @@
from pydantic.functional_validators import AfterValidator
import re

import phonenumbers
from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

phone_number_formats = [
r"^\d{3,11}$", # 911 & 11111111111
r"^\d{3}[\s.-]?\d{4}$", # 456-7890
r"^(\(?\d{1,3}\)?[\s.-])?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}$", # +1 (628) 555-3456 & (+1) 415-155-1555
r"^\(\d{1,3}\)\s\d{10}(\s\(Extension:\s\d{1,4}\))?$", # (+4) 1111111111 (Extension: 323)
r"^(\(?\d{1,3}\)?\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}(,\s\(?ext\.\s\d{1,4}\)?)?$", # 206-555-7115 & 206-555-7115, ext. 239
r"^(\(?\d{1,3}\)?\s)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}(,\s\(?ext\.\s\d{1,4}\)?)?$", # 206-555-7115 & 206-555-7115, ext. 239
]


class ParserTypePhoneNumber:
def __new__(self):
return Annotated[str, AfterValidator(self.validate_type)]
def __new__(cls):
return Annotated[str, AfterValidator(cls.validate_type)]

@staticmethod
def validate_type(number: str) -> str:
# Trim whitespace
formatted_number = number.strip()

# First, try using the phonenumbers library
try:
phone_number = phonenumbers.parse(formatted_number, None) # 'None' implies no specific region
if phonenumbers.is_valid_number(phone_number):
# Return the phone number in international format
return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL)
except phonenumbers.phonenumberutil.NumberParseException:
pass

# If phonenumbers library fails, fall back to regex validation
# Remove plus sign
formatted_number = number.replace("+", "")
# Attempt to parse phone number
Expand Down
15 changes: 8 additions & 7 deletions harambe/parser/type_url.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from pydantic.functional_validators import AfterValidator
from typing import Optional
from typing_extensions import Annotated
import urllib.parse
from urllib.parse import urljoin, urlparse

from pydantic.functional_validators import AfterValidator
from typing_extensions import Annotated

from harambe.types import URL

allowed_url_schemes = [
Expand All @@ -17,9 +17,10 @@


class ParserTypeUrl:
def __new__(self, base_url: Optional[URL] = None):
return Annotated[URL, AfterValidator(self.validate_type(base_url))]
def __new__(cls, base_url: Optional[URL] = None):
return Annotated[str, AfterValidator(cls.validate_type(base_url))]

@staticmethod
def validate_type(base_url: Optional[URL]):
def _validate_type(url: URL) -> str:
# Transform relative URLs into absolute using base_url
Expand All @@ -29,13 +30,13 @@ def _validate_type(url: URL) -> str:

# Parse the URL
try:
parsed_url = urllib.parse.urlparse(url)
parsed_url = urlparse(url)
except ValueError as e:
raise ValueError(f"Unable to parse URL: {url}", e)

# Check if the scheme is allowed
if parsed_url.scheme not in allowed_url_schemes:
raise ValueError(f"Invalid URL: {url}")
raise ValueError(f"Invalid URL scheme: {url}")

return url

Expand Down
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ requests = "^2.32.3"
playwright-stealth = "^1.0.6" # TODO: self host this package
aiohttp = "^3.9.5"
email-validator = "^2.2.0"
phonenumbers = "^8.13.39"

[tool.poetry.group.dev.dependencies]
ruff = "^0.4.10"
Expand Down
57 changes: 57 additions & 0 deletions tests/parser/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,60 @@
"description": "Purely to cause error in the test",
},
}

datetime_schema = {
"event": {
"type": "object",
"properties": {
"name": {"type": "string", "description": "The name of the event"},
"date": {"type": "datetime", "description": "The date of the event"},
},
}
}

phone_number_schema = {
"contact": {
"type": "object",
"properties": {
"name": {"type": "string", "description": "The name of the contact"},
"phone": {"type": "phone_number", "description": "The phone number"},
},
}
}

url_schema = {
"resource": {
"type": "object",
"properties": {
"name": {"type": "string", "description": "The name of the resource"},
"link": {"type": "url", "description": "A link to the resource"},
},
}
}

object_with_nested_types_schema = {
"profile": {
"type": "object",
"properties": {
"user": {"type": "string", "description": "Username"},
"contact": {"type": "phone_number", "description": "Contact number"},
"event_date": {"type": "datetime", "description": "Event date"},
"website": {"type": "url", "description": "Website URL"},
},
}
}

list_with_nested_types_schema = {
"events": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {"type": "string"},
"dates": {"type": "array", "items": {"type": "datetime"}},
"contacts": {"type": "array", "items": {"type": "phone_number"}},
"links": {"type": "array", "items": {"type": "url"}},
},
},
}
}
Loading
Loading