From 2c274d985ba3089ed9084281083b11254f791d05 Mon Sep 17 00:00:00 2001
From: Stuart Mumford
Date: Tue, 28 Jan 2025 16:03:56 +0000
Subject: [PATCH] Fixing loading ASDF files from a directory (#503)

* Add a regression test and docs for new path loader [ci skip]

* Oh god what have I done

* Better ordering invariance

* Add a partial match test

* Add a warning for any skipped files

* Add changelog

* This should never be hit

* Add more test cases of multiple patterns

* DKIST dataset IDs are 5 or more uppercase letters

* Dataset IDs must be at least 5 characters

* Add a regex test for <5 char dataset ids

* More review comments

---------

Co-authored-by: Fraser Watson
---
 changelog/503.bugfix.rst                 |  1 +
 dkist/dataset/loader.py                  | 92 ++++++++++++++++++++++--
 dkist/dataset/tests/test_load_dataset.py | 87 ++++++++++++++++++++++
 3 files changed, 173 insertions(+), 7 deletions(-)
 create mode 100644 changelog/503.bugfix.rst

diff --git a/changelog/503.bugfix.rst b/changelog/503.bugfix.rst
new file mode 100644
index 00000000..6008e571
--- /dev/null
+++ b/changelog/503.bugfix.rst
@@ -0,0 +1 @@
+Improve the ASDF detection code so out-of-date ASDF filenames generated by the DKIST data center are skipped if a newer filename is present.

diff --git a/dkist/dataset/loader.py b/dkist/dataset/loader.py
index 1b1f4ea0..4c1aaecb 100644
--- a/dkist/dataset/loader.py
+++ b/dkist/dataset/loader.py
@@ -1,11 +1,16 @@
+import re
+import warnings
 import importlib.resources as importlib_resources
 from pathlib import Path
 from functools import singledispatch
+from collections import defaultdict
 
 from parfive import Results
 
 import asdf
 
+from dkist.utils.exceptions import DKISTUserWarning
+
 try:
     # first try to import from asdf.exceptions for asdf 2.15+
     from asdf.exceptions import ValidationError
@@ -14,6 +19,9 @@
     from asdf import ValidationError
 
 
+ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"
+
+
 def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
     if asdf.__version__ > "3.1.0":
         return {"memmap": memmap}
@@ -138,8 +146,30 @@ def _load_from_path(path: Path):
 
 def _load_from_directory(directory):
     """
-    Construct a `~dkist.dataset.Dataset` from a directory containing one
-    asdf file and a collection of FITS files.
+    Construct a `~dkist.dataset.Dataset` from a directory containing one (or
+    more) ASDF files and a collection of FITS files.
+
+    ASDF files have the generic pattern:
+
+    ``{instrument}_L1_{start_time:%Y%m%dT%H%M%S}_{dataset_id}[_{suffix}].asdf``
+
+    where the ``_{suffix}`` on the end may be absent or one of a few different
+    suffixes which have been used at different times. When searching a
+    directory for one or more ASDF files to load, we should attempt to load
+    only one per dataset ID by selecting files in suffix order.
+
+    The order of suffixes is (from newest used to oldest):
+
+    - ``_metadata``
+    - ``_user_tools``
+    - None
+
+    The algorithm used to find ASDF files to load in a directory is therefore:
+
+    - Glob the directory for all ASDF files
+    - Group all results by the filename up to and including the dataset id in the filename
+    - Ignore any ASDF files with an old suffix if a newer suffix is present
+    - Throw a warning to the user if any ASDF files with older suffixes are found
     """
     base_path = Path(directory).expanduser()
     asdf_files = tuple(base_path.glob("*.asdf"))
@@ -147,12 +177,60 @@ def _load_from_directory(directory):
     if not asdf_files:
         raise ValueError(f"No asdf file found in directory {base_path}.")
 
-    if len(asdf_files) > 1:
-        return _load_from_iterable(asdf_files)
-
-    asdf_file = asdf_files[0]
+    if len(asdf_files) == 1:
+        return _load_from_asdf(asdf_files[0])
+
+    pattern = re.compile(ASDF_FILENAME_PATTERN)
+    candidates = []
+    asdfs_to_load = []
+    for filepath in asdf_files:
+        filename = filepath.name
+
+        # If the asdf file doesn't match the data center pattern then we load it,
+        # as it's probably a custom user file
+        if pattern.match(filename) is None:
+            asdfs_to_load.append(filepath)
+            continue
+
+        # All the matches have to be checked
+        candidates.append(filepath)
+
+    # If we only have one match, load it
+    if len(candidates) == 1:
+        asdfs_to_load += candidates
+    else:
+        # Now we group by prefix
+        matches = [pattern.match(fp.name) for fp in candidates]
+        grouped = defaultdict(list)
+        for m in matches:
+            prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
+            grouped[prefix].append(m.group("suffix"))
+
+        # Now we select the best suffix for each prefix
+        for prefix, suffixes in grouped.items():
+            if "_metadata" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
+            elif "_user_tools" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
+            elif None in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}.asdf")
+            else:
+                # This branch should never be hit because the regex enumerates the suffixes
+                raise ValueError("Unknown suffix encountered.")  # pragma: no cover
+
+    # Throw a warning if we have skipped any files
+    if ignored_files := set(asdf_files).difference(asdfs_to_load):
+        warnings.warn(
+            f"ASDF files with old names ({', '.join([a.name for a in ignored_files])}) "
+            "were found in this directory and ignored. You may want to delete these files.",
+            DKISTUserWarning
+        )
+
+    if len(asdfs_to_load) == 1:
+        return _load_from_asdf(asdfs_to_load[0])
+
+    return _load_from_iterable(asdfs_to_load)
 
-    return _load_from_asdf(asdf_file)
 
 
 def _load_from_asdf(filepath):
diff --git a/dkist/dataset/tests/test_load_dataset.py b/dkist/dataset/tests/test_load_dataset.py
index bc24a147..b98ab009 100644
--- a/dkist/dataset/tests/test_load_dataset.py
+++ b/dkist/dataset/tests/test_load_dataset.py
@@ -1,4 +1,6 @@
+import re
 import shutil
+import numbers
 
 import pytest
 from parfive import Results
@@ -7,6 +9,8 @@
 
 from dkist import Dataset, TiledDataset, load_dataset
 from dkist.data.test import rootdir
+from dkist.dataset.loader import ASDF_FILENAME_PATTERN
+from dkist.utils.exceptions import DKISTUserWarning
 
 
 @pytest.fixture
@@ -114,3 +118,86 @@ def test_not_dkist_asdf(tmp_path):
 
     with pytest.raises(TypeError, match="not a valid DKIST"):
         load_dataset(tmp_path / "test.asdf")
+
+
+def generate_asdf_folder(tmp_path, asdf_path, filenames):
+    for fname in filenames:
+        shutil.copy(asdf_path, tmp_path / fname)
+
+    return tmp_path
+
+
+@pytest.mark.parametrize(("filename", "match"), [
+    ("VBI_L1_20231016T184519_AJQWW.asdf", True),
+    ("VBI_L1_20231016T184519_AAAA.asdf", False),
+    ("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
+    ("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
+    ("VISP_L1_20231016T184519.asdf", False),
+    ("wibble.asdf", False),
+])
+def test_asdf_regex(filename, match):
+    m = re.match(ASDF_FILENAME_PATTERN, filename)
+    assert bool(m) is match
+
+
+@pytest.mark.parametrize(("filenames", "indices"), [
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_not_a_proper_name.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 1, 3), id="2 other patterns & _user_tools & _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VISP_L1_20231016T184519_AJQWW.asdf",), (0, 1), id="Two patterns, no suffix"),
+    pytest.param(("VBI_L1_20231016T184519_AAAAA.asdf",
+                  "VBI_L1_20231016T184519_AAAAA_metadata.asdf",
"VBI_L1_20231116T184519_BBBBBBB.asdf", + "VBI_L1_20231216T184519_CCCCCCC.asdf", + "VBI_L1_20231216T184519_CCCCCCC_user_tools.asdf"), (1, 2, 4), id="Three patterns, mixed suffixes"), +]) +def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker): + asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames) + + asdf_file_paths = tuple(asdf_folder / fname for fname in filenames) + + load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf") + load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable") + + # The load_dataset call should throw a warning if any files are skipped, but + # not otherwise, the warning should have the filenames of any skipped files in + tuple_of_indices = indices if isinstance(indices, tuple) else (indices,) + if len(tuple_of_indices) == len(filenames): + datasets = load_dataset(asdf_folder) + else: + files_to_be_skipped = set(filenames).difference([filenames[i] for i in tuple_of_indices]) + with pytest.warns(DKISTUserWarning, match=f".*[{'|'.join([re.escape(f) for f in files_to_be_skipped])}].*"): + datasets = load_dataset(asdf_folder) + + if isinstance(indices, numbers.Integral): + load_from_asdf.assert_called_once_with(asdf_file_paths[indices]) + else: + calls = load_from_iterable.mock_calls + # We need to assert that _load_from_iterable is called with the right + # paths but in a order-invariant way. + assert len(calls) == 1 + assert set(calls[0].args[0]) == {asdf_file_paths[i] for i in indices}