Fixing loading ASDF files from a directory (#503)
* Add a regression test and docs for new path loader

[ci skip]

* Oh god what have I done

* Better ordering invariance

* Add a partial match test

* Add a warning for any skipped files

* Add changelog

* This should never be hit

* Add more test cases of multiple patterns

* DKIST dataset IDs are 5 or more uppercase letters

* Dataset IDs must be at least 5 characters

* Add a regex test for <5 char dataset ids

* More review comments

---------

Co-authored-by: Fraser Watson <[email protected]>
Cadair and fraserwatson committed Jan 29, 2025
1 parent 534ba5f commit 2c274d9
Showing 3 changed files with 173 additions and 7 deletions.
1 change: 1 addition & 0 deletions changelog/503.bugfix.rst
@@ -0,0 +1 @@
Improve the ASDF detection code so that out-of-date ASDF filenames generated by the DKIST data center are skipped if a newer filename is present.
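
As a usage sketch of the fixed behaviour (the directory name and its contents below are invented for illustration), loading a directory that contains both an out-of-date and a current filename for the same dataset now loads only the current file and warns about the rest:

import dkist

# Hypothetical directory containing two names for the same dataset:
#   VBI_L1_20231016T184519_AJQWW.asdf            <- out-of-date name, skipped
#   VBI_L1_20231016T184519_AJQWW_metadata.asdf   <- current name, loaded
ds = dkist.load_dataset("/data/VBI_AJQWW")
# DKISTUserWarning: ASDF files with old names (VBI_L1_20231016T184519_AJQWW.asdf)
# were found in this directory and ignored. You may want to delete these files.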
92 changes: 85 additions & 7 deletions dkist/dataset/loader.py
@@ -1,11 +1,16 @@
+import re
+import warnings
 import importlib.resources as importlib_resources
 from pathlib import Path
 from functools import singledispatch
+from collections import defaultdict

 from parfive import Results

 import asdf

+from dkist.utils.exceptions import DKISTUserWarning
+
 try:
     # first try to import from asdf.exceptions for asdf 2.15+
     from asdf.exceptions import ValidationError
@@ -14,6 +19,9 @@
     from asdf import ValidationError


+ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"
+
+
 def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
     if asdf.__version__ > "3.1.0":
         return {"memmap": memmap}
@@ -138,21 +146,91 @@ def _load_from_path(path: Path):

 def _load_from_directory(directory):
     """
-    Construct a `~dkist.dataset.Dataset` from a directory containing one
-    asdf file and a collection of FITS files.
+    Construct a `~dkist.dataset.Dataset` from a directory containing one (or
+    more) ASDF files and a collection of FITS files.
+
+    ASDF files have the generic pattern:
+
+    ``{instrument}_L1_{start_time:%Y%m%dT%H%M%S}_{dataset_id}[_{suffix}].asdf``
+
+    where the ``_{suffix}`` on the end may be absent or one of a few different
+    suffixes which have been used at different times. When searching a
+    directory for one or more ASDF files to load we should attempt to only
+    load one per dataset ID by selecting files in suffix order.
+
+    The order of suffixes is (from newest to oldest):
+
+    - ``_metadata``
+    - ``_user_tools``
+    - None
+
+    The algorithm used to find ASDF files to load in a directory is therefore:
+
+    - Glob the directory for all ASDF files
+    - Group all results by the filename up to and including the dataset ID
+    - Ignore any ASDF files with an old suffix if a newer suffix is present
+    - Throw a warning to the user if any ASDF files with older suffixes are found
     """
     base_path = Path(directory).expanduser()
     asdf_files = tuple(base_path.glob("*.asdf"))

     if not asdf_files:
         raise ValueError(f"No asdf file found in directory {base_path}.")

-    if len(asdf_files) > 1:
-        return _load_from_iterable(asdf_files)
+    if len(asdf_files) == 1:
+        return _load_from_asdf(asdf_files[0])

-    asdf_file = asdf_files[0]
+    pattern = re.compile(ASDF_FILENAME_PATTERN)
+    candidates = []
+    asdfs_to_load = []
+    for filepath in asdf_files:
+        filename = filepath.name
+
+        # If the asdf file doesn't match the data center pattern then we load it
+        # as it's probably a custom user file
+        if pattern.match(filename) is None:
+            asdfs_to_load.append(filepath)
+            continue
+
+        # All the matches have to be checked
+        candidates.append(filepath)
+
+    # If we only have one match, load it
+    if len(candidates) == 1:
+        asdfs_to_load += candidates
+    else:
+        # Now we group by prefix
+        matches = [pattern.match(fp.name) for fp in candidates]
+        grouped = defaultdict(list)
+        for m in matches:
+            prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
+            grouped[prefix].append(m.group("suffix"))
+
+        # Now we select the best suffix for each prefix
+        for prefix, suffixes in grouped.items():
+            if "_metadata" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
+            elif "_user_tools" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
+            elif None in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}.asdf")
+            else:
+                # This branch should never be hit because the regex enumerates the suffixes
+                raise ValueError("Unknown suffix encountered.")  # pragma: no cover
+
+    # Throw a warning if we have skipped any files
+    if ignored_files := set(asdf_files).difference(asdfs_to_load):
+        warnings.warn(
+            f"ASDF files with old names ({', '.join([a.name for a in ignored_files])}) "
+            "were found in this directory and ignored. You may want to delete these files.",
+            DKISTUserWarning
+        )
+
+    if len(asdfs_to_load) == 1:
+        return _load_from_asdf(asdfs_to_load[0])

-    return _load_from_asdf(asdf_file)
+    return _load_from_iterable(asdfs_to_load)


 def _load_from_asdf(filepath):
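
To make the suffix-selection step concrete, here is a minimal standalone sketch of the same grouping-and-preference idea (the filenames are invented for illustration; the authoritative implementation is the _load_from_directory diff above):

import re
from collections import defaultdict

ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"

filenames = [
    "VBI_L1_20231016T184519_AJQWW.asdf",           # oldest naming scheme
    "VBI_L1_20231016T184519_AJQWW_metadata.asdf",  # newest naming scheme
    "VISP_L1_20231016T184519_BBBBB_user_tools.asdf",
]

# Group the filenames by everything up to and including the dataset ID
grouped = defaultdict(list)
for name in filenames:
    m = re.match(ASDF_FILENAME_PATTERN, name)
    prefix = name.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
    grouped[prefix].append(m.group("suffix"))

# Within each group, prefer the newest suffix that is present
for prefix, suffixes in grouped.items():
    best = "_metadata" if "_metadata" in suffixes else ("_user_tools" if "_user_tools" in suffixes else "")
    print(f"{prefix}{best}.asdf")
# VBI_L1_20231016T184519_AJQWW_metadata.asdf
# VISP_L1_20231016T184519_BBBBB_user_tools.asdf

Keying the selection on the prefix up to the dataset ID is what makes the result independent of glob ordering, per the "Better ordering invariance" commit above.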
87 changes: 87 additions & 0 deletions dkist/dataset/tests/test_load_dataset.py
@@ -1,4 +1,6 @@
import re
import shutil
import numbers

import pytest
from parfive import Results
@@ -7,6 +9,8 @@

from dkist import Dataset, TiledDataset, load_dataset
from dkist.data.test import rootdir
from dkist.dataset.loader import ASDF_FILENAME_PATTERN
from dkist.utils.exceptions import DKISTUserWarning


@pytest.fixture
@@ -114,3 +118,86 @@ def test_not_dkist_asdf(tmp_path):

with pytest.raises(TypeError, match="not a valid DKIST"):
load_dataset(tmp_path / "test.asdf")


def generate_asdf_folder(tmp_path, asdf_path, filenames):
for fname in filenames:
shutil.copy(asdf_path, tmp_path / fname)

return tmp_path


@pytest.mark.parametrize(("filename", "match"), [
("VBI_L1_20231016T184519_AJQWW.asdf", True),
("VBI_L1_20231016T184519_AAAA.asdf", False),
("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
("VISP_L1_20231016T184519.asdf", False),
("wibble.asdf", False),
])
def test_asdf_regex(filename, match):
m = re.match(ASDF_FILENAME_PATTERN, filename)
assert bool(m) is match


@pytest.mark.parametrize(("filenames", "indices"), [
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",
"VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
pytest.param(("random.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
pytest.param(("random.asdf",
"VBI_L1_not_a_proper_name.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 1, 3), id="2 other patterns & _user_tools & _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VISP_L1_20231016T184519_AJQWW.asdf",), (0, 1), id="Two patterns, no suffix"),
pytest.param(("VBI_L1_20231016T184519_AAAAA.asdf",
"VBI_L1_20231016T184519_AAAAA_metadata.asdf",
"VBI_L1_20231116T184519_BBBBBBB.asdf",
"VBI_L1_20231216T184519_CCCCCCC.asdf",
"VBI_L1_20231216T184519_CCCCCCC_user_tools.asdf"), (1, 2, 4), id="Three patterns, mixed suffixes"),
])
def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker):
asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames)

asdf_file_paths = tuple(asdf_folder / fname for fname in filenames)

load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf")
load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable")

# The load_dataset call should throw a warning if any files are skipped,
# but not otherwise; the warning should include the filenames of any skipped files.
tuple_of_indices = indices if isinstance(indices, tuple) else (indices,)
if len(tuple_of_indices) == len(filenames):
datasets = load_dataset(asdf_folder)
else:
files_to_be_skipped = set(filenames).difference([filenames[i] for i in tuple_of_indices])
with pytest.warns(DKISTUserWarning, match=f".*[{'|'.join([re.escape(f) for f in files_to_be_skipped])}].*"):
datasets = load_dataset(asdf_folder)

if isinstance(indices, numbers.Integral):
load_from_asdf.assert_called_once_with(asdf_file_paths[indices])
else:
calls = load_from_iterable.mock_calls
# We need to assert that _load_from_iterable is called with the right
# paths, but in an order-invariant way.
assert len(calls) == 1
assert set(calls[0].args[0]) == {asdf_file_paths[i] for i in indices}
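
For reference, the named groups in ASDF_FILENAME_PATTERN can be inspected directly; a quick check using one of the filenames from the parametrized cases above:

import re
from dkist.dataset.loader import ASDF_FILENAME_PATTERN

m = re.match(ASDF_FILENAME_PATTERN, "DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf")
print(m.group("instrument"), m.group("timestamp"), m.group("datasetid"), m.group("suffix"))
# DL-NIRSP 20231016T184519 AJQWW _metadata

# Dataset IDs shorter than five uppercase letters do not match
assert re.match(ASDF_FILENAME_PATTERN, "VBI_L1_20231016T184519_AAAA.asdf") is None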
