From dcd9119cd2c5ae0d2909405e27523ee8df2ec80d Mon Sep 17 00:00:00 2001 From: Austin SeungJun Park <110667795+eagle-25@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:57:03 +0900 Subject: [PATCH] fix(ingest/s3): incorrectly parsing path in s3_uri (#12135) --- metadata-ingestion/src/datahub/ingestion/source/s3/source.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 1863663f98bb2..3ddf47b70cdf8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -9,6 +9,7 @@ from itertools import groupby from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple +from urllib.parse import urlparse import smart_open.compression as so_compression from more_itertools import peekable @@ -993,9 +994,7 @@ def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePa folders = [] for dir in dirs_to_process: logger.info(f"Getting files from folder: {dir}") - prefix_to_process = dir.rstrip("\\").lstrip( - self.create_s3_path(bucket_name, "/") - ) + prefix_to_process = urlparse(dir).path.lstrip("/") folders.extend( self.get_folder_info(