From 080a9924166c49e8d1fb169e750e9d57daa10797 Mon Sep 17 00:00:00 2001
From: Oscar SeungJun Park
Date: Sun, 15 Dec 2024 20:29:43 +0900
Subject: [PATCH] fix(ingest/s3): incorrectly parsing path in s3_uri

---
 metadata-ingestion/src/datahub/ingestion/source/s3/source.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index 1863663f98bb24..3ddf47b70cdf80 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -9,6 +9,7 @@ from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePa
                         folders = []
                         for dir in dirs_to_process:
                             logger.info(f"Getting files from folder: {dir}")
-                            prefix_to_process = dir.rstrip("\\").lstrip(
-                                self.create_s3_path(bucket_name, "/")
-                            )
+                            prefix_to_process = urlparse(dir).path.lstrip("/")
 
                             folders.extend(
                                 self.get_folder_info(