Skip to content

Commit

Permalink
Oh god what have I done
Browse files Browse the repository at this point in the history
  • Loading branch information
Cadair committed Jan 27, 2025
1 parent ce6e6fa commit 87fcae9
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 27 deletions.
47 changes: 46 additions & 1 deletion dkist/dataset/loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
import importlib.resources as importlib_resources
from pathlib import Path
from functools import singledispatch
from collections import defaultdict

from parfive import Results

Expand All @@ -14,6 +16,9 @@
from asdf import ValidationError


# Regex for asdf filenames as produced by the DKIST data center, e.g.
# "VBI_L1_20231016T184519_AJQWW_user_tools.asdf":
#   instrument - upper-case instrument name, may contain "-" (e.g. "DL-NIRSP")
#   timestamp  - YYYYMMDDTHHMMSS
#   datasetid  - upper-case dataset identifier
#   suffix     - optional "_user_tools" or "_metadata" variant marker
# The dot before "asdf" is escaped so only genuine ".asdf" names match
# (an unescaped "." would accept any character in that position).
ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]+)(?P<suffix>_user_tools|_metadata)?\.asdf$"


def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
if asdf.__version__ > "3.1.0":
return {"memmap": memmap}
Expand Down Expand Up @@ -172,7 +177,47 @@ def _load_from_directory(directory):
if len(asdf_files) == 1:
return _load_from_asdf(asdf_files[0])

return _load_from_iterable(asdf_files)
pattern = re.compile(ASDF_FILENAME_PATTERN)
candidates = []
asdfs_to_load = []
for filepath in asdf_files:
filename = filepath.name

# If the asdf file doesn't match the data center pattern then we load it
# as it's probably a custom user file
if pattern.match(filename) is None:
asdfs_to_load.append(filepath)
continue

# All the matches have to be checked
candidates.append(filepath)

# If we only have one match load it
if len(candidates) == 1:
asdfs_to_load += candidates
else:
# Now we group by prefix
matches = [pattern.match(fp.name) for fp in candidates]
grouped = defaultdict(list)
for m in matches:
prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
grouped[prefix].append(m.group("suffix"))

for prefix, suffixes in grouped.items():
if "_metadata" in suffixes:
asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
elif "_user_tools" in suffixes:
asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
elif None in suffixes:
asdfs_to_load.append(base_path / f"{prefix}.asdf")

Check warning on line 212 in dkist/dataset/loader.py

View check run for this annotation

Codecov / codecov/patch

dkist/dataset/loader.py#L211-L212

Added lines #L211 - L212 were not covered by tests
else:
raise ValueError("How did you end up here?")

Check warning on line 214 in dkist/dataset/loader.py

View check run for this annotation

Codecov / codecov/patch

dkist/dataset/loader.py#L214

Added line #L214 was not covered by tests

if len(asdfs_to_load) == 1:
return _load_from_asdf(asdfs_to_load[0])

# Ensure we load in the same order we were passed the files
return _load_from_iterable([pth for pth in asdf_files if pth in asdfs_to_load])



Expand Down
66 changes: 40 additions & 26 deletions dkist/dataset/tests/test_load_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import shutil
import numbers

Expand All @@ -8,6 +9,7 @@

from dkist import Dataset, TiledDataset, load_dataset
from dkist.data.test import rootdir
from dkist.dataset.loader import ASDF_FILENAME_PATTERN


@pytest.fixture
Expand Down Expand Up @@ -124,45 +126,57 @@ def generate_asdf_folder(tmp_path, asdf_path, filenames):
return tmp_path


@pytest.mark.parametrize(("filename", "match"), [
    ("VBI_L1_20231016T184519_AJQWW.asdf", True),
    ("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
    ("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
    ("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
    ("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
    ("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
    ("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
    ("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
    ("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
    ("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
    ("wibble.asdf", False),
])
def test_asdf_regex(filename, match):
    """The data-center filename pattern accepts exactly the expected names."""
    result = re.match(ASDF_FILENAME_PATTERN, filename)
    assert (result is not None) is match


# NOTE(review): this parametrize list appears to contain BOTH the pre-change
# plain-tuple cases and the post-change pytest.param cases (a unified-diff
# scrape artifact?) -- the first four tuples duplicate the first four
# pytest.param entries; confirm against the actual file which set belongs.
@pytest.mark.parametrize(("filenames", "indices"), [
    # param[0] is the tuple of filenames to create in the test folder
    # param[1] is the index (or indices) in that tuple that should be loaded
    (("VBI_L1_20231016T184519_AJQWW.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2),
    (("VBI_L1_20231016T184519_AJQWW.asdf",
      "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
      "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
      "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3)),
    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
                  "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
    pytest.param(("random.asdf",
                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
    pytest.param(("random.asdf",
                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 2), id="other pattern & _user_tools & _metadata"),
])
def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker):
    """Check that load_dataset picks the right asdf file(s) out of a folder."""
    # Build a directory populated with the given asdf filenames.
    asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames)

    asdf_file_paths = tuple(asdf_folder / fname for fname in filenames)

    # First we check that we load the correct amount of datasets and that the
    # loading completes correctly

    datasets = load_dataset(asdf_folder)

    # An integer index means exactly one file should load (a single Dataset);
    # a tuple of indices means multiple datasets are expected.
    if isinstance(indices, numbers.Integral):
        assert isinstance(datasets, Dataset)
    else:
        assert len(datasets) == len(indices)

    # Now we check that the correct files are chosen

    load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf")
    load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable")

    datasets = load_dataset(asdf_folder)
    if isinstance(indices, numbers.Integral):
        load_from_asdf.assert_called_once_with(asdf_file_paths[indices])
    else:
        # NOTE(review): both the old tuple-argument and new list-argument
        # assertions are present here (diff artifact?); running both
        # back-to-back cannot succeed if the loader passes a list, since the
        # first assert would then fail -- confirm which single line the real
        # file contains.
        load_from_iterable.assert_called_once_with(tuple(asdf_file_paths[i] for i in indices))
        load_from_iterable.assert_called_once_with([asdf_file_paths[i] for i in indices])

0 comments on commit 87fcae9

Please sign in to comment.