From 87fcae9263a039836fc5ed558de2d091e6dca79b Mon Sep 17 00:00:00 2001
From: Stuart Mumford <stuart@cadair.com>
Date: Mon, 27 Jan 2025 12:32:57 +0000
Subject: [PATCH] Oh god what have I done

---
 dkist/dataset/loader.py                  | 47 ++++++++++++++++-
 dkist/dataset/tests/test_load_dataset.py | 66 ++++++++++++++----------
 2 files changed, 86 insertions(+), 27 deletions(-)

diff --git a/dkist/dataset/loader.py b/dkist/dataset/loader.py
index a7098718..5dde86fc 100644
--- a/dkist/dataset/loader.py
+++ b/dkist/dataset/loader.py
@@ -1,6 +1,8 @@
+import re
 import importlib.resources as importlib_resources
 from pathlib import Path
 from functools import singledispatch
+from collections import defaultdict
 
 from parfive import Results
 
@@ -14,6 +16,9 @@
 from asdf import ValidationError
 
 
+ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]+)(?P<suffix>_user_tools|_metadata)?.asdf$"
+
+
 def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
     if asdf.__version__ > "3.1.0":
         return {"memmap": memmap}
@@ -172,7 +177,47 @@ def _load_from_directory(directory):
 
     if len(asdf_files) == 1:
         return _load_from_asdf(asdf_files[0])
 
-    return _load_from_iterable(asdf_files)
+    pattern = re.compile(ASDF_FILENAME_PATTERN)
+    candidates = []
+    asdfs_to_load = []
+    for filepath in asdf_files:
+        filename = filepath.name
+
+        # If the asdf file doesn't match the data center pattern then we load it
+        # as it's probably a custom user file
+        if pattern.match(filename) is None:
+            asdfs_to_load.append(filepath)
+            continue
+
+        # All the matches have to be checked
+        candidates.append(filepath)
+
+    # If we only have one match load it
+    if len(candidates) == 1:
+        asdfs_to_load += candidates
+    else:
+        # Now we group by prefix
+        matches = [pattern.match(fp.name) for fp in candidates]
+        grouped = defaultdict(list)
+        for m in matches:
+            prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
+            grouped[prefix].append(m.group("suffix"))
+
+        for prefix, suffixes in grouped.items():
+            if "_metadata" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
+            elif "_user_tools" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
+            elif None in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}.asdf")
+            else:
+                raise ValueError("How did you end up here?")
+
+    if len(asdfs_to_load) == 1:
+        return _load_from_asdf(asdfs_to_load[0])
+
+    # Ensure we load in the same order we were passed the files
+    return _load_from_iterable([pth for pth in asdf_files if pth in asdfs_to_load])
 
 
diff --git a/dkist/dataset/tests/test_load_dataset.py b/dkist/dataset/tests/test_load_dataset.py
index 9b7c9f0b..18ca1f27 100644
--- a/dkist/dataset/tests/test_load_dataset.py
+++ b/dkist/dataset/tests/test_load_dataset.py
@@ -1,3 +1,4 @@
+import re
 import shutil
 import numbers
 
@@ -8,6 +9,7 @@
 
 from dkist import Dataset, TiledDataset, load_dataset
 from dkist.data.test import rootdir
+from dkist.dataset.loader import ASDF_FILENAME_PATTERN
 
 
 @pytest.fixture
@@ -124,40 +126,52 @@ def generate_asdf_folder(tmp_path, asdf_path, filenames):
     return tmp_path
 
 
+@pytest.mark.parametrize(("filename", "match"), [
+    ("VBI_L1_20231016T184519_AJQWW.asdf", True),
+    ("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
+    ("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
+    ("wibble.asdf", False),
+    ])
+def test_asdf_regex(filename, match):
+
+    m = re.match(ASDF_FILENAME_PATTERN, filename)
+    assert bool(m) is match
+
+
 @pytest.mark.parametrize(("filenames", "indices"), [
     # param[0] is list of filenames
     # parram[1] is the indices in that list that should be used
-    (("VBI_L1_20231016T184519_AJQWW.asdf",), 0),
-    (("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0),
-    (("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0),
-    (("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0),
-    (("VBI_L1_20231016T184519_AJQWW.asdf",
-      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1),
-    (("VBI_L1_20231016T184519_AJQWW.asdf",
-      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
-      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2),
-    (("VBI_L1_20231016T184519_AJQWW.asdf",
-      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
-      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
-      "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3)),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 2), id="other pattern & _user_tools & _metadata"),
     ])
 def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker):
     asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames)
 
     asdf_file_paths = tuple(asdf_folder / fname for fname in filenames)
 
-    # First we check that we load the correct amount of datasets and that the
-    # loading completes correctly
-
-    datasets = load_dataset(asdf_folder)
-
-    if isinstance(indices, numbers.Integral):
-        assert isinstance(datasets, Dataset)
-    else:
-        assert len(datasets) == len(indices)
-
-    # Now we check that the correct files are chosen
-
     load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf")
     load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable")
 
@@ -165,4 +179,4 @@
     if isinstance(indices, numbers.Integral):
        load_from_asdf.assert_called_once_with(asdf_file_paths[indices])
     else:
-        load_from_iterable.assert_called_once_with(tuple(asdf_file_paths[i] for i in indices))
+        load_from_iterable.assert_called_once_with([asdf_file_paths[i] for i in indices])