From 561d655a9d3eaef0f421f4eb2b82d35b61735045 Mon Sep 17 00:00:00 2001
From: asim-shrestha
Date: Fri, 13 Dec 2024 14:12:20 -0800
Subject: [PATCH 1/2] =?UTF-8?q?=E2=9C=A8=20Don't=20normalize=20S3=20urls?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 core/harambe_core/normalize_url.py | 12 ++++++++++--
 core/pyproject.toml                |  2 +-
 core/test/parser/test_type_url.py  |  5 +++++
 core/test/test_normalize_url.py    |  5 +++++
 core/uv.lock                       |  2 +-
 sdk/pyproject.toml                 |  4 ++--
 sdk/uv.lock                        |  4 ++--
 7 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/core/harambe_core/normalize_url.py b/core/harambe_core/normalize_url.py
index 8b6c28f..8ebc4aa 100644
--- a/core/harambe_core/normalize_url.py
+++ b/core/harambe_core/normalize_url.py
@@ -11,7 +11,10 @@ def normalize_url(path: str, base_path: str | None) -> str:
     """
     path = sanitize_scheme(path)
     validate_allowed_scheme(path)
-    path = _normalize(path)
+    if not is_s3_url(path):
+        # We append actual URLs at the end of S3 urls occasionally
+        # Normalization will turn https:// into http:/
+        path = _normalize(path)
 
     escaped_path = path.replace(" ", "%20")
     if base_path is None:
@@ -45,7 +48,12 @@ def sanitize_scheme(url: str) -> str:
     return base + url[last_scheme_index + 1 :] if last_scheme_index > 0 else url
 
 
-allowed_url_schemes = ["http", "https", "s3", "file"]
+s3_scheme = "s3"
+allowed_url_schemes = ["http", "https", s3_scheme, "file"]
+
+
+def is_s3_url(url: str) -> bool:
+    return urlparse(url).scheme == s3_scheme
 
 
 def validate_allowed_scheme(url: str, scheme_required: bool = False) -> None:
diff --git a/core/pyproject.toml b/core/pyproject.toml
index 281401a..47259d2 100644
--- a/core/pyproject.toml
+++ b/core/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "harambe-core"
-version = "0.55.0"
+version = "0.56.0"
 description = "Core types for harambe SDK 🐒🍌"
 authors = [
     { name = "Adam Watkins", email = "adam@reworkd.ai" }
diff --git a/core/test/parser/test_type_url.py b/core/test/parser/test_type_url.py
index f09ff54..487d429 100644
--- a/core/test/parser/test_type_url.py
+++ b/core/test/parser/test_type_url.py
@@ -61,6 +61,11 @@
             "https://example.com",
             "https://example.com/doc1",
         ),
+        (
+            "s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
+            "https://www.example.com",
+            "s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
+        ),
     ],
 )
 def test_pydantic_type_url_validate_type_success(url, base_url_, expected):
diff --git a/core/test/test_normalize_url.py b/core/test/test_normalize_url.py
index 802248e..8e059f5 100644
--- a/core/test/test_normalize_url.py
+++ b/core/test/test_normalize_url.py
@@ -112,6 +112,11 @@
             "//primoliquors.com/cdn/shop/files/oxcuimdzbczobdeo248f.png?v=1727810919&width=1946",
             "https://primoliquors.com/cdn/shop/files/oxcuimdzbczobdeo248f.png?v=1727810919&width=1946",
         ),
+        (
+            "https://www.example.com",
+            "s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
+            "s3://deworkd-local-files/5a9a3ac5-a572-494d-a888-8a065e6e3878/579c900dbfb97dae2fbf329cf6b3411a.pdf;https://example.com/",
+        ),
     ],
 )
 def test_normalize_url(base_path, url, expected):
diff --git a/core/uv.lock b/core/uv.lock
index c0019ef..dd5a038 100644
--- a/core/uv.lock
+++ b/core/uv.lock
@@ -115,7 +115,7 @@ wheels = [
 
 [[package]]
 name = "harambe-core"
-version = "0.55.0"
+version = "0.56.0"
 source = { virtual = "." }
 dependencies = [
     { name = "dateparser" },
diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml
index 0bea7ca..2feaca4 100644
--- a/sdk/pyproject.toml
+++ b/sdk/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "harambe-sdk"
-version = "0.55.0"
+version = "0.56.0"
 description = "Data extraction SDK for Playwright 🐒🍌"
 authors = [
     { name = "Adam Watkins", email = "adam@reworkd.ai" }
@@ -8,7 +8,7 @@ authors = [
 requires-python = ">=3.11,<4.0"
 readme = "README.md"
 dependencies = [
-    "harambe_core==0.55.0",
+    "harambe_core==0.56.0",
     "playwright==1.47.0",
     "beautifulsoup4==4.12.3",
     "requests==2.32.3",
diff --git a/sdk/uv.lock b/sdk/uv.lock
index c20908b..2f51499 100644
--- a/sdk/uv.lock
+++ b/sdk/uv.lock
@@ -428,7 +428,7 @@ wheels = [
 
 [[package]]
 name = "harambe-core"
-version = "0.55.0"
+version = "0.56.0"
 source = { editable = "../core" }
 dependencies = [
     { name = "dateparser" },
@@ -459,7 +459,7 @@ dev = [
 
 [[package]]
 name = "harambe-sdk"
-version = "0.55.0"
+version = "0.56.0"
 source = { virtual = "." }
 dependencies = [
     { name = "aiohttp" },

From 03ae49d9fe6db5b5735b33c94d1d5cf680fa3288 Mon Sep 17 00:00:00 2001
From: asim-shrestha
Date: Fri, 13 Dec 2024 14:17:22 -0800
Subject: [PATCH 2/2] =?UTF-8?q?=E2=9C=A8=20Don't=20normalize=20S3=20urls?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 core/harambe_core/normalize_url.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/core/harambe_core/normalize_url.py b/core/harambe_core/normalize_url.py
index 8ebc4aa..e18e57b 100644
--- a/core/harambe_core/normalize_url.py
+++ b/core/harambe_core/normalize_url.py
@@ -14,6 +14,8 @@ def normalize_url(path: str, base_path: str | None) -> str:
     if not is_s3_url(path):
         # We append actual URLs at the end of S3 urls occasionally
         # Normalization will turn https:// into http:/
+        # TODO: When we handle dynamic downloads in our worker, we need to remove this logic
+        # we should also remove s3 as an allowed scheme all together
         path = _normalize(path)
 
     escaped_path = path.replace(" ", "%20")
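
Reviewer note: for context on the guard added in PATCH 1/2, here is a
minimal, self-contained sketch of the failure mode it avoids. The names
is_s3_url and normalize_url mirror the diff, but _normalize below is an
illustrative stand-in (the real harambe_core normalizer is not part of
this patch), the example S3 URL is hypothetical, and base_path handling
is omitted for brevity.

    import re
    from urllib.parse import urlparse

    S3_SCHEME = "s3"

    def is_s3_url(url: str) -> bool:
        # Same check the patch adds: an S3 URL is identified by scheme alone.
        return urlparse(url).scheme == S3_SCHEME

    def _normalize(url: str) -> str:
        # Stand-in normalizer (an assumption): keep the scheme's "://"
        # intact, but collapse any other run of slashes down to one.
        scheme, sep, rest = url.partition("://")
        return scheme + sep + re.sub(r"/{2,}", "/", rest)

    def normalize_url(path: str) -> str:
        # Mirrors the control flow added by the patch: S3 URLs skip
        # normalization because real URLs are occasionally appended to
        # them, and collapsing slashes would corrupt the embedded URL.
        if not is_s3_url(path):
            path = _normalize(path)
        return path.replace(" ", "%20")

    url = "s3://bucket/key.pdf;https://example.com/"
    assert _normalize(url) == "s3://bucket/key.pdf;https:/example.com/"
    assert normalize_url(url) == url  # S3 URLs now pass through untouched

This is also why the new test cases in both test files expect the
s3://...;https://example.com/ inputs to come back byte-for-byte unchanged.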