Skip to content

Commit

Permalink
Merge pull request #100 from reworkd/s3-scheme
Browse files Browse the repository at this point in the history
✨ Don't normalize S3 urls
  • Loading branch information
asim-shrestha authored Dec 13, 2024
2 parents 349e75c + 03ae49d commit aadf18c
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 8 deletions.
14 changes: 12 additions & 2 deletions core/harambe_core/normalize_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@ def normalize_url(path: str, base_path: str | None) -> str:
"""
path = sanitize_scheme(path)
validate_allowed_scheme(path)
path = _normalize(path)
if not is_s3_url(path):
# We append actual URLs at the end of S3 urls occasionally
# Normalization will turn https:// into http:/
# TODO: When we handle dynamic downloads in our worker, we need to remove this logic;
# we should also remove s3 as an allowed scheme altogether.
path = _normalize(path)
escaped_path = path.replace(" ", "%20")

if base_path is None:
Expand Down Expand Up @@ -45,7 +50,12 @@ def sanitize_scheme(url: str) -> str:
return base + url[last_scheme_index + 1 :] if last_scheme_index > 0 else url


allowed_url_schemes = ["http", "https", "s3", "file"]
s3_scheme = "s3"
allowed_url_schemes = ["http", "https", s3_scheme, "file"]


def is_s3_url(url: str) -> bool:
    """Report whether ``url`` carries the S3 scheme.

    The scheme component parsed out of ``url`` is compared against the
    module-level ``s3_scheme`` constant.
    """
    parsed_scheme = urlparse(url).scheme
    return parsed_scheme == s3_scheme


def validate_allowed_scheme(url: str, scheme_required: bool = False) -> None:
Expand Down
2 changes: 1 addition & 1 deletion core/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "harambe-core"
version = "0.55.0"
version = "0.56.0"
description = "Core types for harambe SDK 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
Expand Down
5 changes: 5 additions & 0 deletions core/test/parser/test_type_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@
"https://example.com",
"https://example.com/doc1",
),
(
"s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
"https://www.example.com",
"s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
),
],
)
def test_pydantic_type_url_validate_type_success(url, base_url_, expected):
Expand Down
5 changes: 5 additions & 0 deletions core/test/test_normalize_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@
"//primoliquors.com/cdn/shop/files/oxcuimdzbczobdeo248f.png?v=1727810919&width=1946",
"https://primoliquors.com/cdn/shop/files/oxcuimdzbczobdeo248f.png?v=1727810919&width=1946",
),
(
"https://www.example.com",
"s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
"s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
),
],
)
def test_normalize_url(base_path, url, expected):
Expand Down
2 changes: 1 addition & 1 deletion core/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions sdk/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
[project]
name = "harambe-sdk"
version = "0.55.0"
version = "0.56.0"
description = "Data extraction SDK for Playwright 🐒🍌"
authors = [
{ name = "Adam Watkins", email = "[email protected]" }
]
requires-python = ">=3.11,<4.0"
readme = "README.md"
dependencies = [
"harambe_core==0.55.0",
"harambe_core==0.56.0",
"playwright==1.47.0",
"beautifulsoup4==4.12.3",
"requests==2.32.3",
Expand Down
4 changes: 2 additions & 2 deletions sdk/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit aadf18c

Please sign in to comment.