From 91dc524dae4f7b6f6dc28bf2ea01d068bb400860 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Tue, 27 Aug 2024 22:54:39 +0100 Subject: [PATCH 1/5] Added to CLI & SDK + created MultiFileItem parsing & layout building --- darwin/cli.py | 1 + darwin/cli_functions.py | 9 ++ darwin/dataset/remote_dataset.py | 1 + darwin/dataset/remote_dataset_v2.py | 134 ++++++++++++++++---- darwin/dataset/upload_manager.py | 32 +++++ darwin/options.py | 6 + tests/darwin/dataset/remote_dataset_test.py | 14 ++ 7 files changed, 169 insertions(+), 28 deletions(-) diff --git a/darwin/cli.py b/darwin/cli.py index bf493341a..c48f55e38 100644 --- a/darwin/cli.py +++ b/darwin/cli.py @@ -126,6 +126,7 @@ def _run(args: Namespace, parser: ArgumentParser) -> None: args.extract_views, args.preserve_folders, args.verbose, + args.item_merge_mode, ) # Remove a project (remotely) elif args.action == "remove": diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index 5d01d7a9d..f2b11999d 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -656,6 +656,7 @@ def upload_data( extract_views: bool = False, preserve_folders: bool = False, verbose: bool = False, + item_merge_mode: Optional[str] = None, ) -> None: """ Uploads the provided files to the remote dataset. @@ -684,6 +685,13 @@ def upload_data( Specify whether or not to preserve folder paths when uploading. verbose : bool Specify whether to have full traces print when uploading files or not. + item_merge_mode : Optional[str] + If set, each file path passed to `files_to_upload` behaves as follows: + - Every path that points directly to a file is ignored + - Each folder of files passed to `files_to_upload` will be uploaded according to the following modes: + - "slots": Each file in the folder will be uploaded to a different slot of the same item. + - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored. + - "channels": Each file in the folder will be uploaded to a different channel of the same item. """ client: Client = _load_client() try: @@ -773,6 +781,7 @@ def file_upload_callback( preserve_folders=preserve_folders, progress_callback=progress_callback, file_upload_callback=file_upload_callback, + item_merge_mode=item_merge_mode, ) console = Console(theme=_console_theme()) diff --git a/darwin/dataset/remote_dataset.py b/darwin/dataset/remote_dataset.py index 154c26c85..34f642912 100644 --- a/darwin/dataset/remote_dataset.py +++ b/darwin/dataset/remote_dataset.py @@ -138,6 +138,7 @@ def push( preserve_folders: bool = False, progress_callback: Optional[ProgressCallback] = None, file_upload_callback: Optional[FileUploadCallback] = None, + item_merge_mode: Optional[str] = None, ) -> UploadHandler: pass diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index 59f87b320..72241785f 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -18,7 +18,9 @@ from darwin.dataset.release import Release from darwin.dataset.upload_manager import ( FileUploadCallback, + ItemMergeMode, LocalFile, + MultiFileItem, ProgressCallback, UploadHandler, UploadHandlerV2, @@ -166,6 +168,7 @@ def push( preserve_folders: bool = False, progress_callback: Optional[ProgressCallback] = None, file_upload_callback: Optional[FileUploadCallback] = None, + item_merge_mode: Optional[str] = None, ) -> UploadHandler: """ Uploads a local dataset (images ONLY) in the datasets directory. 
@@ -173,7 +176,8 @@ def push( Parameters ---------- files_to_upload : Optional[List[Union[PathLike, LocalFile]]] - List of files to upload. Those can be folders. + List of files to upload. These can be folders. + If `item_merge_mode` is set, these must be folders. blocking : bool, default: True If False, the dataset is not uploaded and a generator function is returned instead. multi_threaded : bool, default: True @@ -188,7 +192,7 @@ def push( extract_views: bool, default: False When the uploading file is a volume, specify whether it's going to be split into orthogonal views. files_to_exclude : Optional[PathLike]], default: None - Optional list of files to exclude from the file scan. Those can be folders. + Optional list of files to exclude from the file scan. These can be folders. path: Optional[str], default: None Optional path to store the files in. preserve_folders : bool, default: False @@ -197,6 +201,13 @@ def push( Optional callback, called every time the progress of an uploading files is reported. file_upload_callback: Optional[FileUploadCallback], default: None Optional callback, called every time a file chunk is uploaded. + item_merge_mode: Optional[str], default: None + If set, each file path passed to `files_to_upload` behaves as follows: + - Every path that points directly to a file is ignored + - Each folder of files passed to `files_to_upload` will be uploaded according to the following modes: + - "slots": Each file in the folder will be uploaded to a different slot of the same item. + - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored. + - "channels": Each file in the folder will be uploaded to a different channel of the same item. Returns ------- @@ -216,39 +227,41 @@ def push( if files_to_upload is None: raise ValueError("No files or directory specified.") + if item_merge_mode: + try: + item_merge_mode = ItemMergeMode(item_merge_mode) + except ValueError: + raise ValueError( + f"Invalid item merge mode: {item_merge_mode}. 
Valid options are: 'slots', 'series', 'channels'"
+                )
+
+        if item_merge_mode and preserve_folders:
+            raise TypeError("Cannot use `preserve_folders` with `item_merge_mode`")
+
+        # Direct file paths
         uploading_files = [
             item for item in files_to_upload if isinstance(item, LocalFile)
         ]
+
+        # Folder paths
         search_files = [
             item for item in files_to_upload if not isinstance(item, LocalFile)
         ]
 
-        generic_parameters_specified = (
-            path is not None or fps != 0 or as_frames is not False
-        )
-        if uploading_files and generic_parameters_specified:
-            raise ValueError("Cannot specify a path when uploading a LocalFile object.")
-
-        for found_file in find_files(search_files, files_to_exclude=files_to_exclude):
-            local_path = path
-            if preserve_folders:
-                source_files = [
-                    source_file
-                    for source_file in search_files
-                    if is_relative_to(found_file, source_file)
-                ]
-                if source_files:
-                    local_path = str(
-                        found_file.relative_to(source_files[0]).parent.as_posix()
-                    )
-            uploading_files.append(
-                LocalFile(
-                    found_file,
-                    fps=fps,
-                    as_frames=as_frames,
-                    extract_views=extract_views,
-                    path=local_path,
-                )
+        if item_merge_mode:
+            uploading_files = find_files_to_upload_merging(
+                search_files, files_to_exclude, item_merge_mode
+            )
+        else:
+            uploading_files = find_files_to_upload_no_merging(
+                search_files,
+                files_to_exclude,
+                path,
+                fps,
+                as_frames,
+                extract_views,
+                preserve_folders,
+                uploading_files,
             )
 
         if not uploading_files:
             raise ValueError(
                 "No files to upload, check your path, exclusion filters and resume flag"
             )
@@ -842,3 +855,68 @@ def register_multi_slotted(
                 print(f"  - {item}")
         print(f"Registration complete. Check your items in the dataset: {self.slug}")
         return results
+
+
+def find_files_to_upload_merging(
+    search_files: List[PathLike],
+    files_to_exclude: Optional[List[PathLike]],
+    item_merge_mode: str,
+) -> List[MultiFileItem]:
+    multi_file_items = []
+    for directory in search_files:
+        files_in_directory = list(
+            find_files([directory], files_to_exclude=files_to_exclude, recursive=False)
+        )
+        if not files_in_directory:
+            print(
+                f"Warning: There are no uploading files in the first level of {directory}, skipping"
+            )
+            continue
+        multi_file_items.append(
+            MultiFileItem(directory, files_in_directory, item_merge_mode)
+        )
+    if not multi_file_items:
+        raise ValueError(
+            "No valid folders to upload after searching the passed directories for files"
+        )
+    return multi_file_items
+
+
+def find_files_to_upload_no_merging(
+    search_files: List[PathLike],
+    files_to_exclude: Optional[List[PathLike]],
+    path: Optional[str],
+    fps: int,
+    as_frames: bool,
+    extract_views: bool,
+    preserve_folders: bool,
+    uploading_files: List[LocalFile],
+) -> List[LocalFile]:
+    generic_parameters_specified = (
+        path is not None or fps != 0 or as_frames is not False
+    )
+    if uploading_files and generic_parameters_specified:
+        raise ValueError("Cannot specify a path when uploading a LocalFile object.")
+
+    for found_file in find_files(search_files, files_to_exclude=files_to_exclude):
+        local_path = path
+        if preserve_folders:
+            source_files = [
+                source_file
+                for source_file in search_files
+                if is_relative_to(found_file, source_file)
+            ]
+            if source_files:
+                local_path = str(
+                    found_file.relative_to(source_files[0]).parent.as_posix()
+                )
+        uploading_files.append(
+            LocalFile(
+                found_file,
+                fps=fps,
+                as_frames=as_frames,
+                extract_views=extract_views,
+                path=local_path,
+            )
+        )
+    return uploading_files
diff --git a/darwin/dataset/upload_manager.py b/darwin/dataset/upload_manager.py
index bcd28f8f7..53fad692c 100644
--- a/darwin/dataset/upload_manager.py
+++ b/darwin/dataset/upload_manager.py
@@ -2,6 
+2,7 @@ import os import time from dataclasses import dataclass +from enum import Enum from pathlib import Path from typing import ( TYPE_CHECKING, @@ -31,6 +32,12 @@ from typing import Dict +class ItemMergeMode(Enum): + SLOTS = "slots" + SERIES = "series" + CHANNELS = "channels" + + class ItemPayload: """ Represents an item's payload. @@ -186,6 +193,31 @@ def full_path(self) -> str: return construct_full_path(self.data["path"], self.data["filename"]) +class MultiFileItem: + def __init__( + self, directory: PathLike, files: List[PathLike], merge_mode: ItemMergeMode + ): + self.directory = directory + self.files = files + self.merge_mode = merge_mode + self.layout = self._create_layout() + self.temp = {"version": 2, "slots": ["1", "2", "3"], "type": "grid"} + + def _create_layout(self): + # TODO + if ( + self.merge_mode == ItemMergeMode.slots + or self.merge_mode == ItemMergeMode.series + ): + return { + "version": 2, + "slots": [str(i) for i in range(len(self.files))], + "type": "grid", # Worth experimenting with - Is this the best option? Should we change this dynamically? + } + else: + return {"version": 3, "slots_grid": [[[file.name for file in self.files]]]} + + class FileMonitor(object): """ Monitors the progress of a :class:``BufferedReader``. diff --git a/darwin/options.py b/darwin/options.py index b6a292394..996c27b3a 100644 --- a/darwin/options.py +++ b/darwin/options.py @@ -183,6 +183,12 @@ def __init__(self) -> None: action="store_true", help="Preserve the local folder structure in the dataset.", ) + parser_push.add_argument( + "--item-merge-mode", + type=str, + choices=["slots", "series", "channels"], + help="Specify the item merge mode: slots, series, or channels", + ) # Remove parser_remove = dataset_action.add_parser( diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index c974ace0e..b0cf10276 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -659,6 +659,20 @@ def test_raises_with_unsupported_files(self, remote_dataset: RemoteDataset): with pytest.raises(UnsupportedFileType): remote_dataset.push(["test.txt"]) + def test_raises_if_invalid_item_merge_mode(self, remote_dataset: RemoteDataset): + with pytest.raises(ValueError): + remote_dataset.push(["path/to/dir"], item_merge_mode="invalid") + + def test_raises_if_preserve_folders_with_item_merge_mode( + self, remote_dataset: RemoteDataset + ): + with pytest.raises(TypeError): + remote_dataset.push( + ["path/to/dir"], + item_merge_mode="slots", + preserve_folders=True, + ) + @pytest.mark.usefixtures("file_read_write_test") class TestPull: From cd806a9bde32159bcfb97b008c757f251d7758be Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Thu, 29 Aug 2024 18:00:39 +0100 Subject: [PATCH 2/5] Unit test coverage --- darwin/cli_functions.py | 3 +- darwin/dataset/remote_dataset_v2.py | 82 +++++++++-- darwin/dataset/upload_manager.py | 48 +++++-- tests/darwin/data/push_test_dir.zip | Bin 0 -> 24374 bytes tests/darwin/dataset/remote_dataset_test.py | 146 +++++++++++++++++++- 5 files changed, 250 insertions(+), 29 deletions(-) create mode 100644 tests/darwin/data/push_test_dir.zip diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index f2b11999d..ce00f6a54 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -688,7 +688,8 @@ def upload_data( item_merge_mode : Optional[str] If set, each file path passed to `files_to_upload` behaves as follows: - Every path that points directly to a file is ignored - - 
Each folder of files passed to `files_to_upload` will be uploaded according to the following mode rules.
+          Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded:
         - "slots": Each file in the folder will be uploaded to a different slot of the same item.
         - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored.
         - "channels": Each file in the folder will be uploaded to a different channel of the same item.
diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py
index 72241785f..eecc725d9 100644
--- a/darwin/dataset/remote_dataset_v2.py
+++ b/darwin/dataset/remote_dataset_v2.py
@@ -1,4 +1,5 @@
 import json
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -204,7 +205,8 @@ def push(
         item_merge_mode: Optional[str], default: None
             If set, each file path passed to `files_to_upload` behaves as follows:
             - Every path that points directly to a file is ignored
-            - Each folder of files passed to `files_to_upload` will be uploaded according to the following modes:
+            - Each folder of files passed to `files_to_upload` will be uploaded according to the following mode rules.
+              Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded:
             - "slots": Each file in the folder will be uploaded to a different slot of the same item.
             - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored.
             - "channels": Each file in the folder will be uploaded to a different channel of the same item.
@@ -229,7 +231,7 @@ def push(
 
         if item_merge_mode:
             try:
-                item_merge_mode = ItemMergeMode(item_merge_mode)
+                ItemMergeMode(item_merge_mode)
             except ValueError:
                 raise ValueError(
                     f"Invalid item merge mode: {item_merge_mode}. Valid options are: 'slots', 'series', 'channels'"
                 )
@@ -249,11 +251,11 @@ def push(
         ]
 
         if item_merge_mode:
-            uploading_files = find_files_to_upload_merging(
+            uploading_files = _find_files_to_upload_merging(
                 search_files, files_to_exclude, item_merge_mode
             )
         else:
-            uploading_files = find_files_to_upload_no_merging(
+            uploading_files = _find_files_to_upload_no_merging(
                 search_files,
                 files_to_exclude,
                 path,
@@ -264,11 +266,6 @@ def push(
                 uploading_files,
             )
 
-        if not uploading_files:
-            raise ValueError(
-                "No files to upload, check your path, exclusion filters and resume flag"
-            )
-
         handler = UploadHandlerV2(self, uploading_files)
         if blocking:
             handler.upload(
@@ -857,11 +854,33 @@ def register_multi_slotted(
     return results
 
 
-def find_files_to_upload_merging(
+def _find_files_to_upload_merging(
     search_files: List[PathLike],
-    files_to_exclude: Optional[List[PathLike]],
+    files_to_exclude: List[PathLike],
     item_merge_mode: str,
 ) -> List[MultiFileItem]:
+    """
+    Finds files to upload as either:
+    - Multi-slotted items
+    - Multi-channel items
+    - Single-slotted items containing multiple `.dcm` files
+
+    Does not search each directory recursively, only considers files in the first level of each directory.
+
+    Parameters
+    ----------
+    search_files : List[PathLike]
+        List of directories to search for files.
+    files_to_exclude : List[PathLike]
+        List of files to exclude from the file scan.
+    item_merge_mode : str
+        Mode to merge the files in the folders. Valid options are: 'slots', 'series', 'channels'. 
+ + Returns + ------- + List[MultiFileItem] + List of files to upload. + """ multi_file_items = [] for directory in search_files: files_in_directory = list( @@ -873,7 +892,9 @@ def find_files_to_upload_merging( ) continue multi_file_items.append( - MultiFileItem(directory, files_in_directory, item_merge_mode) + MultiFileItem( + Path(directory), files_in_directory, ItemMergeMode(item_merge_mode) + ) ) if not multi_file_items: raise ValueError( @@ -882,9 +903,9 @@ def find_files_to_upload_merging( return multi_file_items -def find_files_to_upload_no_merging( +def _find_files_to_upload_no_merging( search_files: List[PathLike], - files_to_exclude: Optional[List[PathLike]], + files_to_exclude: List[PathLike], path: Optional[str], fps: int, as_frames: bool, @@ -892,6 +913,33 @@ def find_files_to_upload_no_merging( preserve_folders: bool, uploading_files: List[LocalFile], ) -> List[LocalFile]: + """ + Finds files to upload as single-slotted dataset items. Recursively searches the passed directories for files. + + Parameters + ---------- + search_files : List[PathLike] + List of directories to search for files. + files_to_exclude : Optional[List[PathLike]] + List of files to exclude from the file scan. + path : Optional[str] + Path to store the files in. + fps: int + When uploading video files, specify the framerate. + as_frames: bool + When uploading video files, specify whether to upload as a list of frames. + extract_views: bool + When uploading volume files, specify whether to split into orthogonal views. + preserve_folders: bool + Specify whether or not to preserve folder paths when uploading. + uploading_files : List[LocalFile] + List of files to upload. + + Returns + ------- + List[LocalFile] + List of files to upload. + """ generic_parameters_specified = ( path is not None or fps != 0 or as_frames is not False ) @@ -919,4 +967,10 @@ def find_files_to_upload_no_merging( path=local_path, ) ) + + if not uploading_files: + raise ValueError( + "No files to upload, check your path, exclusion filters and resume flag" + ) + return uploading_files diff --git a/darwin/dataset/upload_manager.py b/darwin/dataset/upload_manager.py index 53fad692c..855d9e415 100644 --- a/darwin/dataset/upload_manager.py +++ b/darwin/dataset/upload_manager.py @@ -14,6 +14,7 @@ Optional, Set, Tuple, + Union, ) import requests @@ -194,27 +195,46 @@ def full_path(self) -> str: class MultiFileItem: - def __init__( - self, directory: PathLike, files: List[PathLike], merge_mode: ItemMergeMode - ): + def __init__(self, directory: Path, files: List[Path], merge_mode: ItemMergeMode): self.directory = directory + self.name = directory.name self.files = files self.merge_mode = merge_mode self.layout = self._create_layout() - self.temp = {"version": 2, "slots": ["1", "2", "3"], "type": "grid"} def _create_layout(self): - # TODO - if ( - self.merge_mode == ItemMergeMode.slots - or self.merge_mode == ItemMergeMode.series - ): + """ + Creates the layout to be used when uploading the files: + - For multi-slotted items: LayoutV2 + - For series items: LayoutV2, but only with `.dcm` files + - For multi-channel items: LayoutV3 + + Raises + ------ + ValueError + - If no DICOM files are found in the directory for ItemMergeMode.SEIRES items + - If the number of files is greater than 16 for ItemMergeMode.CHANNELS items + """ + if self.merge_mode == ItemMergeMode.SLOTS: + return { + "version": 2, + "slots": [str(i) for i in range(len(self.files))], + "type": "grid", + } + elif self.merge_mode == ItemMergeMode.SERIES: + self.files = [file for 
file in self.files if file.suffix.lower() == ".dcm"]
+            if not self.files:
+                raise ValueError("No DICOM files found in 1st level of directory")
             return {
                 "version": 2,
                 "slots": [str(i) for i in range(len(self.files))],
-                "type": "grid",  # Worth experimenting with - Is this the best option? Should we change this dynamically?
+                "type": "grid",
             }
-        else:
+        elif self.merge_mode == ItemMergeMode.CHANNELS:
+            if len(self.files) > 16:
+                raise ValueError(
+                    f"No multi-channel item can have more than 16 channels. The following directory has {len(self.files)} files: {self.directory}"
+                )
             return {"version": 3, "slots_grid": [[[file.name for file in self.files]]]}
 
@@ -434,7 +454,11 @@ def _upload_file(
 
 
 class UploadHandlerV2(UploadHandler):
-    def __init__(self, dataset: "RemoteDataset", local_files: List[LocalFile]):
+    def __init__(
+        self,
+        dataset: "RemoteDataset",
+        local_files: Union[List[LocalFile], List[MultiFileItem]],
+    ):
         super().__init__(dataset=dataset, local_files=local_files)
 
     def _request_upload(self) -> Tuple[List[ItemPayload], List[ItemPayload]]:
diff --git a/tests/darwin/data/push_test_dir.zip b/tests/darwin/data/push_test_dir.zip
new file mode 100644
index 0000000000000000000000000000000000000000..2a09ce77336d274582539635428692163674766f
GIT binary patch
literal 24374
[24374 bytes of base85-encoded zip data omitted]

literal 0
HcmV?d00001

diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py
index b0cf10276..5af8d04ab 100644
--- a/tests/darwin/dataset/remote_dataset_test.py
+++ b/tests/darwin/dataset/remote_dataset_test.py
@@ -20,8 +20,17 @@
     download_all_images_from_annotations,
 )
 from darwin.dataset.release import Release, ReleaseStatus
-from darwin.dataset.remote_dataset_v2 import RemoteDatasetV2
-from darwin.dataset.upload_manager import LocalFile, UploadHandlerV2
+from darwin.dataset.remote_dataset_v2 import (
+    RemoteDatasetV2,
+    _find_files_to_upload_merging,
+    _find_files_to_upload_no_merging,
+)
+from darwin.dataset.upload_manager import (
+    ItemMergeMode,
+    LocalFile,
+    MultiFileItem,
+    UploadHandlerV2,
+)
 from darwin.datatypes import ManifestItem, ObjectStore, SegmentManifest
 from darwin.exceptions import UnsupportedExportFormat, UnsupportedFileType
 from darwin.item import DatasetItem
@@ -601,6 +610,14 @@ def remote_dataset(
 
 @pytest.mark.usefixtures("file_read_write_test")
 class TestPush:
+    @pytest.fixture(scope="class")
+    def setup_zip(self):
+        zip_path = Path("tests/darwin/data/push_test_dir.zip")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                zip_ref.extractall(tmpdir)
+            yield Path(tmpdir)
+
     def test_raises_if_files_are_not_provided(self, remote_dataset: RemoteDataset):
         with pytest.raises(ValueError):
             remote_dataset.push(None)
@@ -673,6 +690,131 @@ def test_raises_if_preserve_folders_with_item_merge_mode(
                 preserve_folders=True,
             )
 
+    def test_find_files_to_upload_merging_slots(self, setup_zip):
+        base_path = setup_zip / "push_test_dir" / "dir1"
+        search_files = [base_path / "item1", base_path / "item2"]
+        multi_file_items = _find_files_to_upload_merging(search_files, [], "slots")
+        assert len(multi_file_items) == 2
+        
assert all(isinstance(item, MultiFileItem) for item in multi_file_items) + + def test_find_files_to_upload_merging_series(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" + search_files = [base_path / "dicoms"] + multi_file_items = _find_files_to_upload_merging(search_files, [], "series") + assert len(multi_file_items) == 1 + assert all(isinstance(item, MultiFileItem) for item in multi_file_items) + + def test_find_files_to_upload_merging_channels(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" + search_files = [base_path / "item1", base_path / "item2"] + multi_file_items = _find_files_to_upload_merging(search_files, [], "slots") + assert len(multi_file_items) == 2 + + def test_find_files_to_upload_merging_does_not_search_recursively(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir2" + search_files = [base_path / "recursive_search"] + multi_file_items = _find_files_to_upload_merging(search_files, [], "slots") + assert len(multi_file_items) == 1 + assert len(multi_file_items[0].files) == 2 + + def test_find_files_to_upload_no_merging_searches_recursively(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir2" + search_files = [base_path / "recursive_search"] + local_files = _find_files_to_upload_no_merging( + search_files, + [], + None, + 0, + False, + False, + False, + [], + ) + assert len(local_files) == 11 + assert all(isinstance(file, LocalFile) for file in local_files) + + def test_find_files_to_upload_no_merging_no_files(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir2" + search_files = [base_path / "no_files1", base_path / "no_files2"] + with pytest.raises( + ValueError, + match="No files to upload, check your path, exclusion filters and resume flag", + ): + _find_files_to_upload_no_merging( + search_files, + [], + None, + 0, + False, + False, + False, + [], + ) + + +class TestMultiFileItem: + @pytest.fixture(scope="class") + def setup_zip(self): + zip_path = Path("tests/darwin/data/push_test_dir.zip") + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(tmpdir) + yield Path(tmpdir) + + def test_create_multi_file_item_slots(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" / "item1" + files = list(base_path.glob("*")) + item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SLOTS) + assert len(item.files) == 7 + assert item.name == "item1" + assert item.layout == { + "version": 2, + "slots": ["0", "1", "2", "3", "4", "5", "6"], + "type": "grid", + } + + def test_create_multi_file_item_series(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" / "dicoms" + files = list(base_path.glob("*")) + item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SERIES) + assert len(item.files) == 6 + assert item.name == "dicoms" + assert item.layout == { + "version": 2, + "slots": ["0", "1", "2", "3", "4", "5"], + "type": "grid", + } + + def test_create_multi_file_item_channels(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" / "item1" + files = list(base_path.glob("*")) + item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.CHANNELS) + assert len(item.files) == 7 + assert item.name == "item1" + assert item.layout == { + "version": 3, + "slots_grid": [ + [["4.jpg", "5.JPG", "7.JPG", "6.jpg", "3.JPG", "1.JPG", "2"]] + ], + } + + def test_create_series_no_valid_files(self, setup_zip): + base_path = setup_zip / "push_test_dir" / "dir1" / "item1" + files = 
list(base_path.glob("*"))
+        with pytest.raises(
+            ValueError, match="No DICOM files found in 1st level of directory"
+        ):
+            MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SERIES)
+
+    def test_create_channels_too_many_files(self, setup_zip):
+        base_path = setup_zip / "push_test_dir" / "dir2" / "too_many_channels"
+        files = list(base_path.glob("*"))
+        with pytest.raises(
+            ValueError,
+            match=f"No multi-channel item can have more than 16 channels. The following directory has 17 files: {base_path}",
+        ):
+            MultiFileItem(base_path, files, merge_mode=ItemMergeMode.CHANNELS)
+
 
 @pytest.mark.usefixtures("file_read_write_test")
 class TestPull:

From bd325172817ab76de91d12b35f38d44a9544ccdd Mon Sep 17 00:00:00 2001
From: John Wilkie
Date: Fri, 30 Aug 2024 14:13:01 +0100
Subject: [PATCH 3/5] Cleaned up tests

---
 tests/darwin/data/push_test_dir.zip         | Bin 24374 -> 14762 bytes
 tests/darwin/dataset/remote_dataset_test.py |  28 +++++++++----------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/tests/darwin/data/push_test_dir.zip b/tests/darwin/data/push_test_dir.zip
index 2a09ce77336d274582539635428692163674766f..6d76d817411da1608ab0374474f13321e222aaed 100644
GIT binary patch
literal 14762
[14762 bytes of base85-encoded zip data omitted]

literal 24374
[24374 bytes of base85-encoded zip data omitted]

diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py
index 5af8d04ab..897364580 100644
--- a/tests/darwin/dataset/remote_dataset_test.py
+++ b/tests/darwin/dataset/remote_dataset_test.py
@@ -692,7 +692,7 @@ def test_raises_if_preserve_folders_with_item_merge_mode(
 
     def test_find_files_to_upload_merging_slots(self, setup_zip):
         base_path = setup_zip / "push_test_dir" / "dir1"
-        search_files = [base_path / "item1", base_path / "item2"]
+        search_files = [base_path / "jpegs", base_path / "dicoms"]
         multi_file_items = _find_files_to_upload_merging(search_files, [], "slots")
         assert len(multi_file_items) == 2
         assert all(isinstance(item, MultiFileItem) for item in multi_file_items)
@@ -706,8 +706,8 @@ def test_find_files_to_upload_merging_series(self, setup_zip):
 
     def test_find_files_to_upload_merging_channels(self, setup_zip):
         base_path = setup_zip / "push_test_dir" / "dir1"
-        search_files = [base_path / "item1", base_path / "item2"]
-        multi_file_items = _find_files_to_upload_merging(search_files, [], "slots")
+        search_files = [base_path / "jpegs", base_path / "dicoms"]
+        multi_file_items = _find_files_to_upload_merging(search_files, [], "channels")
         assert len(multi_file_items) == 2
 
     def test_find_files_to_upload_merging_does_not_search_recursively(self, setup_zip):
@@ -735,7 +735,7 @@ def test_find_files_to_upload_no_merging_searches_recursively(self, setup_zip):
 
     def test_find_files_to_upload_no_merging_no_files(self, setup_zip):
         base_path = setup_zip / "push_test_dir" / "dir2"
-        search_files = [base_path / "no_files1", base_path / "no_files2"]
+        search_files = [base_path / "no_files_1", base_path / "no_files_2"]
         with pytest.raises(
             ValueError,
             match="No files to upload, check your path, exclusion filters and resume flag",
@@ -762,14 +762,14 @@ def setup_zip(self):
         yield Path(tmpdir)
 
     def test_create_multi_file_item_slots(self, setup_zip):
-        base_path = setup_zip / "push_test_dir" / "dir1" / "item1"
+        base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs"
         files = list(base_path.glob("*"))
         item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SLOTS)
-        assert len(item.files) == 7
-        assert item.name == "item1"
+        assert len(item.files) == 6
+        assert item.name == "jpegs"
         assert item.layout == {
             "version": 2,
-            "slots": ["0", "1", "2", "3", "4", "5", "6"],
+            "slots": ["0", "1", "2", "3", "4", "5"],
             "type": "grid",
         }
 
@@ -786,20 +786,18 @@ def test_create_multi_file_item_series(self, setup_zip):
 
     def test_create_multi_file_item_channels(self, setup_zip):
-        base_path = setup_zip / "push_test_dir" / "dir1" / "item1"
+        base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs"
         files = list(base_path.glob("*"))
         item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.CHANNELS)
-        assert len(item.files) == 7
-        assert item.name == "item1"
+        assert len(item.files) == 6
+        assert item.name == "jpegs"
         assert item.layout == {
             "version": 3,
-            "slots_grid": [
-                [["4.jpg", "5.JPG", "7.JPG", "6.jpg", "3.JPG", "1.JPG", "2"]]
-            ],
+            "slots_grid": [[["4.jpg", "5.JPG", "7.JPG", "6.jpg", "3.JPG", "1.JPG"]]],
         }
 
     def test_create_series_no_valid_files(self, setup_zip):
-        base_path = setup_zip / "push_test_dir" / "dir1" / "item1"
+        base_path = setup_zip / 
"push_test_dir" / "dir1" / "jpegs" files = list(base_path.glob("*")) with pytest.raises( ValueError, match="No DICOM files found in 1st level of directory" From a123c88f724b6e12dd0541e4c1ab4f10779f96c6 Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 30 Aug 2024 14:33:11 +0100 Subject: [PATCH 4/5] Added natural sorting to the order of files in multi-file items --- darwin/dataset/remote_dataset_v2.py | 7 ++++++- darwin/utils/utils.py | 13 ++++++++++--- poetry.lock | 17 ++++++++++++++++- pyproject.toml | 1 + tests/darwin/dataset/remote_dataset_test.py | 13 +++++++------ 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index eecc725d9..063b402d6 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -884,7 +884,12 @@ def _find_files_to_upload_merging( multi_file_items = [] for directory in search_files: files_in_directory = list( - find_files([directory], files_to_exclude=files_to_exclude, recursive=False) + find_files( + [directory], + files_to_exclude=files_to_exclude, + recursive=False, + sort=True, + ) ) if not files_in_directory: print( diff --git a/darwin/utils/utils.py b/darwin/utils/utils.py index 5f57449ad..f2cbd8d85 100644 --- a/darwin/utils/utils.py +++ b/darwin/utils/utils.py @@ -25,6 +25,7 @@ import requests from json_stream.base import PersistentStreamingJSONList, PersistentStreamingJSONObject from jsonschema import validators +from natsort import natsorted from requests import Response from rich.progress import ProgressType, track from upolygon import draw_polygon @@ -216,6 +217,7 @@ def find_files( *, files_to_exclude: List[dt.PathLike] = [], recursive: bool = True, + sort: bool = False, ) -> List[Path]: """ Retrieve a list of all files belonging to supported extensions. The exploration can be made @@ -229,7 +231,8 @@ def find_files( List of files to exclude from the search. recursive : bool Flag for recursive search. - + sort : bool + Flag for sorting the files naturally, i.e. file2.txt will come before file10.txt. Returns ------- List[Path] @@ -255,8 +258,12 @@ def find_files( raise UnsupportedFileType(path) files_to_exclude_full_paths = [str(Path(f)) for f in files_to_exclude] - - return [f for f in found_files if str(f) not in files_to_exclude_full_paths] + filtered_files = [ + f for f in found_files if str(f) not in files_to_exclude_full_paths + ] + if sort: + return natsorted(filtered_files) + return filtered_files def secure_continue_request() -> bool: diff --git a/poetry.lock b/poetry.lock index 9161c0ff2..a850e8c4c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -913,6 +913,21 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "natsort" +version = "8.4.0" +description = "Simple yet flexible natural sorting in Python." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, + {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + [[package]] name = "networkx" version = "3.1" @@ -2238,4 +2253,4 @@ test = ["pytest", "responses"] [metadata] lock-version = "2.0" python-versions = ">=3.8.0,<3.12" -content-hash = "6e6c0628c98652df5dd76a8d82a0f67af9ec2037388350412152d21d84fa9d57" +content-hash = "3ea848bf4d0e5e0f22170f20321ce5d426eb79c6bc0a536b36519fd6f7c6782e" diff --git a/pyproject.toml b/pyproject.toml index 10c69fc3e..ef4146782 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ types-requests = "^2.28.11.8" upolygon = "0.1.11" tenacity = "8.5.0" +natsort = "^8.4.0" [tool.poetry.extras] dev = ["black", "isort", "flake8", "mypy", "debugpy", "responses", "pytest", "flake8-pyproject", "pytest-rerunfailures", "ruff", "validate-pyproject"] medical = ["nibabel", "connected-components-3d", "scipy"] diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index 897364580..0c69b054c 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -10,6 +10,7 @@ import orjson as json import pytest import responses +from natsort import natsorted from pydantic import ValidationError from darwin.client import Client @@ -763,7 +764,7 @@ def setup_zip(self): def test_create_multi_file_item_slots(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs" - files = list(base_path.glob("*")) + files = natsorted(list(base_path.glob("*"))) item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SLOTS) assert len(item.files) == 6 assert item.name == "jpegs" @@ -775,7 +776,7 @@ def test_create_multi_file_item_slots(self, setup_zip): def test_create_multi_file_item_series(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir1" / "dicoms" - files = list(base_path.glob("*")) + files = natsorted(list(base_path.glob("*"))) item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SERIES) assert len(item.files) == 6 assert item.name == "dicoms" @@ -787,18 +788,18 @@ def test_create_multi_file_item_series(self, setup_zip): def test_create_multi_file_item_channels(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs" - files = list(base_path.glob("*")) + files = natsorted(list(base_path.glob("*"))) item = MultiFileItem(base_path, files, merge_mode=ItemMergeMode.CHANNELS) assert len(item.files) == 6 assert item.name == "jpegs" assert item.layout == { "version": 3, - "slots_grid": [[["4.jpg", "5.JPG", "7.JPG", "6.jpg", "3.JPG", "1.JPG"]]], + "slots_grid": [[["1.JPG", "3.JPG", "4.jpg", "5.JPG", "6.jpg", "7.JPG"]]], } def test_create_series_no_valid_files(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs" - files = list(base_path.glob("*")) + files = natsorted(list(base_path.glob("*"))) with pytest.raises( ValueError, match="No DICOM files found in 1st level of directory" ): @@ -806,7 +807,7 @@ def test_create_series_no_valid_files(self, setup_zip): def test_create_channels_too_many_files(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir2" / "too_many_channels" - files = list(base_path.glob("*")) + files = natsorted(list(base_path.glob("*"))) with pytest.raises( 
ValueError, match=f"No multi-channel item can have more than 16 channels. The following directory has 17 files: {base_path}", From 3c678b650f4608f2a58f628a407d9468606c698c Mon Sep 17 00:00:00 2001 From: John Wilkie Date: Fri, 30 Aug 2024 16:14:50 +0100 Subject: [PATCH 5/5] Fix for failing test on Windows --- darwin/cli_functions.py | 4 ++-- darwin/dataset/remote_dataset_v2.py | 15 ++++++++------- darwin/dataset/upload_manager.py | 10 +++++----- darwin/options.py | 2 +- tests/darwin/dataset/remote_dataset_test.py | 4 ++-- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/darwin/cli_functions.py b/darwin/cli_functions.py index ce00f6a54..9f67fc1b7 100644 --- a/darwin/cli_functions.py +++ b/darwin/cli_functions.py @@ -687,8 +687,8 @@ def upload_data( Specify whether to have full traces print when uploading files or not. item_merge_mode : Optional[str] If set, each file path passed to `files_to_upload` behaves as follows: - - Every path that points directly to a file is ignored - - Each folder of files passed to `files_to_upload` will be uploaded according to the following mode rules. + - Paths pointing directly to individual files are ignored + - Paths pointing to folders of files will be uploaded according to the following mode rules. Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded: - "slots": Each file in the folder will be uploaded to a different slot of the same item. - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored. diff --git a/darwin/dataset/remote_dataset_v2.py b/darwin/dataset/remote_dataset_v2.py index 063b402d6..fba60c8ff 100644 --- a/darwin/dataset/remote_dataset_v2.py +++ b/darwin/dataset/remote_dataset_v2.py @@ -202,19 +202,18 @@ def push( Optional callback, called every time the progress of an uploading files is reported. file_upload_callback: Optional[FileUploadCallback], default: None Optional callback, called every time a file chunk is uploaded. - item_merge_mode: Optional[str], default: None + item_merge_mode : Optional[str] If set, each file path passed to `files_to_upload` behaves as follows: - - Every path that points directly to a file is ignored - - Each folder of files passed to `files_to_upload` will be uploaded according to the following mode rules. - Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded: + - Paths pointing directly to individual files are ignored + - Paths pointing to folders of files will be uploaded according to the below mode rules. + Note that folders will not be recursively searched, so only files in the first level of the folder will be uploaded: - "slots": Each file in the folder will be uploaded to a different slot of the same item. - "series": All `.dcm` files in the folder will be concatenated into a single slot. All other files are ignored. - "channels": Each file in the folder will be uploaded to a different channel of the same item. - Returns ------- handler : UploadHandler - Class for handling uploads, progress and error messages. + Class for handling uploads, progress and error messages. 
Raises ------ @@ -238,7 +237,9 @@ def push( ) if item_merge_mode and preserve_folders: - raise TypeError("Cannot use `preserve_folders` with `item_merge_mode`") + raise TypeError( + "`item_merge_mode` does not support preserving local file structures with `preserve_folders` or `--folders`" + ) # Direct file paths uploading_files = [ diff --git a/darwin/dataset/upload_manager.py b/darwin/dataset/upload_manager.py index 855d9e415..7ba3fcd86 100644 --- a/darwin/dataset/upload_manager.py +++ b/darwin/dataset/upload_manager.py @@ -204,7 +204,7 @@ def __init__(self, directory: Path, files: List[Path], merge_mode: ItemMergeMode def _create_layout(self): """ - Creates the layout to be used when uploading the files: + Creates the layout to be used when uploading the files as a dataset item: - For multi-slotted items: LayoutV2 - For series items: LayoutV2, but only with `.dcm` files - For multi-channel items: LayoutV3 @@ -212,8 +212,8 @@ def _create_layout(self): Raises ------ ValueError - - If no DICOM files are found in the directory for ItemMergeMode.SEIRES items - - If the number of files is greater than 16 for ItemMergeMode.CHANNELS items + - If no DICOM files are found in the directory for `ItemMergeMode.SERIES` items + - If the number of files is greater than 16 for `ItemMergeMode.CHANNELS` items """ if self.merge_mode == ItemMergeMode.SLOTS: return { @@ -224,7 +224,7 @@ def _create_layout(self): elif self.merge_mode == ItemMergeMode.SERIES: self.files = [file for file in self.files if file.suffix.lower() == ".dcm"] if not self.files: - raise ValueError("No DICOM files found in 1st level of directory") + raise ValueError("No `.dcm` files found in 1st level of directory") return { "version": 2, "slots": [str(i) for i in range(len(self.files))], @@ -233,7 +233,7 @@ def _create_layout(self): elif self.merge_mode == ItemMergeMode.CHANNELS: if len(self.files) > 16: raise ValueError( - f"No multi-channel item can have more than 16 channels. The following directory has {len(self.files)} files: {self.directory}" + f"No multi-channel item can have more than 16 files. The following directory has {len(self.files)} files: {self.directory}" ) return {"version": 3, "slots_grid": [[[file.name for file in self.files]]]} diff --git a/darwin/options.py b/darwin/options.py index 996c27b3a..6ac6c6717 100644 --- a/darwin/options.py +++ b/darwin/options.py @@ -187,7 +187,7 @@ def __init__(self) -> None: "--item-merge-mode", type=str, choices=["slots", "series", "channels"], - help="Specify the item merge mode: slots, series, or channels", + help="Specify the item merge mode: `slots`, `series`, or `channels`", ) # Remove diff --git a/tests/darwin/dataset/remote_dataset_test.py b/tests/darwin/dataset/remote_dataset_test.py index 0c69b054c..72dea3d65 100644 --- a/tests/darwin/dataset/remote_dataset_test.py +++ b/tests/darwin/dataset/remote_dataset_test.py @@ -801,7 +801,7 @@ def test_create_series_no_valid_files(self, setup_zip): base_path = setup_zip / "push_test_dir" / "dir1" / "jpegs" files = natsorted(list(base_path.glob("*"))) with pytest.raises( - ValueError, match="No DICOM files found in 1st level of directory" + ValueError, match="No `.dcm` files found in 1st level of directory" ): MultiFileItem(base_path, files, merge_mode=ItemMergeMode.SERIES) @@ -810,7 +810,7 @@ def test_create_channels_too_many_files(self, setup_zip): files = natsorted(list(base_path.glob("*"))) with pytest.raises( ValueError, - match=f"No multi-channel item can have more than 16 channels. 
The following directory has 17 files: {base_path}", + match=r"No multi-channel item can have more than 16 files. The following directory has 17 files: .*", ): MultiFileItem(base_path, files, merge_mode=ItemMergeMode.CHANNELS)
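
For reviewers trying the branch out, here is a minimal usage sketch of the feature this series adds. It is not part of the patches: the team and dataset slug and the folder paths are hypothetical, and it assumes the usual darwin-py entry points (`Client.local()` and `Client.get_remote_dataset()`) alongside the new `item_merge_mode` parameter introduced above.

    from darwin.client import Client

    # Authenticate using the local darwin-py configuration.
    client = Client.local()
    dataset = client.get_remote_dataset("my-team/my-dataset")  # hypothetical slug

    # "slots": every file in the first level of the folder becomes a
    # separate slot of a single dataset item named after the folder.
    dataset.push(["scans/patient_1"], item_merge_mode="slots")

    # "series": only the folder's .dcm files are kept and concatenated
    # into a single slot; "channels" puts each file (16 at most) on its
    # own channel of one item.
    dataset.push(["scans/patient_2"], item_merge_mode="series")

The CLI path added in `darwin/options.py` would be exercised as `darwin dataset push my-team/my-dataset scans/patient_1 --item-merge-mode slots`. Note that the series deliberately rejects `preserve_folders`/`--folders` in combination with `item_merge_mode`, since local folder structure cannot be preserved while merging folders into items.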