From 2c274d985ba3089ed9084281083b11254f791d05 Mon Sep 17 00:00:00 2001
From: Stuart Mumford
Date: Tue, 28 Jan 2025 16:03:56 +0000
Subject: [PATCH] Fixing loading ASDF files from a directory (#503)

* Add a regression test and docs for new path loader [ci skip]

* Oh god what have I done

* Better ordering invariance

* Add a partial match test

* Add a warning for any skipped files

* Add changelog

* This should never be hit

* Add more test cases of multiple patterns

* DKIST dataset IDs are 5 or more uppercase letters

* Dataset IDs must be at least 5 characters

* Add a regex test for <5 char dataset ids

* More review comments

---------

Co-authored-by: Fraser Watson
---
 changelog/503.bugfix.rst                 |  1 +
 dkist/dataset/loader.py                  | 92 ++++++++++++++++++++++--
 dkist/dataset/tests/test_load_dataset.py | 87 ++++++++++++++++++++++
 3 files changed, 173 insertions(+), 7 deletions(-)
 create mode 100644 changelog/503.bugfix.rst

diff --git a/changelog/503.bugfix.rst b/changelog/503.bugfix.rst
new file mode 100644
index 00000000..6008e571
--- /dev/null
+++ b/changelog/503.bugfix.rst
@@ -0,0 +1 @@
+Improve the ASDF detection code so out-of-date ASDF filenames generated by the DKIST data center are skipped if a newer filename is present.

diff --git a/dkist/dataset/loader.py b/dkist/dataset/loader.py
index 1b1f4ea0..4c1aaecb 100644
--- a/dkist/dataset/loader.py
+++ b/dkist/dataset/loader.py
@@ -1,11 +1,16 @@
+import re
+import warnings
 import importlib.resources as importlib_resources
 from pathlib import Path
 from functools import singledispatch
+from collections import defaultdict
 
 from parfive import Results
 
 import asdf
 
+from dkist.utils.exceptions import DKISTUserWarning
+
 try:
     # first try to import from asdf.exceptions for asdf 2.15+
     from asdf.exceptions import ValidationError
@@ -14,6 +19,9 @@
     from asdf import ValidationError
 
 
+ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"
+
+
 def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
     if asdf.__version__ > "3.1.0":
         return {"memmap": memmap}
@@ -138,8 +146,30 @@ def _load_from_path(path: Path):
 
 def _load_from_directory(directory):
     """
-    Construct a `~dkist.dataset.Dataset` from a directory containing one
-    asdf file and a collection of FITS files.
+    Construct a `~dkist.dataset.Dataset` from a directory containing one (or
+    more) ASDF files and a collection of FITS files.
+
+    ASDF files have the generic pattern:
+
+    ``{instrument}_L1_{start_time:%Y%m%dT%H%M%S}_{dataset_id}[_{suffix}].asdf``
+
+    where the ``_{suffix}`` on the end may be absent or one of a few different
+    suffixes which have been used at different times. When searching a
+    directory for one or more ASDF files to load, we should attempt to load
+    only one per dataset ID by selecting files in suffix order.
+
+    The order of suffixes is (from newest used to oldest):
+
+    - ``_metadata``
+    - ``_user_tools``
+    - None
+
+    The algorithm used to find ASDF files to load in a directory is therefore:
+
+    - Glob the directory for all ASDF files
+    - Group all results by the filename up to and including the dataset id in the filename
+    - Ignore any ASDF files with an old suffix if a newer suffix is present
+    - Throw a warning to the user if any ASDF files with older suffixes are found
     """
     base_path = Path(directory).expanduser()
     asdf_files = tuple(base_path.glob("*.asdf"))
@@ -147,12 +177,60 @@ def _load_from_directory(directory):
     if not asdf_files:
         raise ValueError(f"No asdf file found in directory {base_path}.")
 
-    if len(asdf_files) > 1:
-        return _load_from_iterable(asdf_files)
-
-    asdf_file = asdf_files[0]
+    if len(asdf_files) == 1:
+        return _load_from_asdf(asdf_files[0])
+
+    pattern = re.compile(ASDF_FILENAME_PATTERN)
+    candidates = []
+    asdfs_to_load = []
+    for filepath in asdf_files:
+        filename = filepath.name
+
+        # If the asdf file doesn't match the data center pattern then we load it,
+        # as it's probably a custom user file
+        if pattern.match(filename) is None:
+            asdfs_to_load.append(filepath)
+            continue
+
+        # All the matches have to be checked
+        candidates.append(filepath)
+
+    # If we only have one match, load it
+    if len(candidates) == 1:
+        asdfs_to_load += candidates
+    else:
+        # Now we group by prefix
+        matches = [pattern.match(fp.name) for fp in candidates]
+        grouped = defaultdict(list)
+        for m in matches:
+            prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
+            grouped[prefix].append(m.group("suffix"))
+
+        # Now we select the best suffix for each prefix
+        for prefix, suffixes in grouped.items():
+            if "_metadata" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
+            elif "_user_tools" in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
+            elif None in suffixes:
+                asdfs_to_load.append(base_path / f"{prefix}.asdf")
+            else:
+                # This branch should never be hit because the regex enumerates the suffixes
+                raise ValueError("Unknown suffix encountered.")  # pragma: no cover
+
+    # Throw a warning if we have skipped any files
+    if ignored_files := set(asdf_files).difference(asdfs_to_load):
+        warnings.warn(
+            f"ASDF files with old names ({', '.join([a.name for a in ignored_files])}) "
+            "were found in this directory and ignored. You may want to delete these files.",
+            DKISTUserWarning
+        )
+
+    if len(asdfs_to_load) == 1:
+        return _load_from_asdf(asdfs_to_load[0])
+
+    return _load_from_iterable(asdfs_to_load)
 
-    return _load_from_asdf(asdf_file)
 
 
 def _load_from_asdf(filepath):
diff --git a/dkist/dataset/tests/test_load_dataset.py b/dkist/dataset/tests/test_load_dataset.py
index bc24a147..b98ab009 100644
--- a/dkist/dataset/tests/test_load_dataset.py
+++ b/dkist/dataset/tests/test_load_dataset.py
@@ -1,4 +1,6 @@
+import re
 import shutil
+import numbers
 
 import pytest
 from parfive import Results
@@ -7,6 +9,8 @@
 
 from dkist import Dataset, TiledDataset, load_dataset
 from dkist.data.test import rootdir
+from dkist.dataset.loader import ASDF_FILENAME_PATTERN
+from dkist.utils.exceptions import DKISTUserWarning
 
 
 @pytest.fixture
@@ -114,3 +118,86 @@ def test_not_dkist_asdf(tmp_path):
 
     with pytest.raises(TypeError, match="not a valid DKIST"):
         load_dataset(tmp_path / "test.asdf")
+
+
+def generate_asdf_folder(tmp_path, asdf_path, filenames):
+    for fname in filenames:
+        shutil.copy(asdf_path, tmp_path / fname)
+
+    return tmp_path
+
+
+@pytest.mark.parametrize(("filename", "match"), [
+    ("VBI_L1_20231016T184519_AJQWW.asdf", True),
+    ("VBI_L1_20231016T184519_AAAA.asdf", False),
+    ("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
+    ("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
+    ("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
+    ("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
+    ("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
+    ("VISP_L1_20231016T184519.asdf", False),
+    ("wibble.asdf", False),
+])
+def test_asdf_regex(filename, match):
+    m = re.match(ASDF_FILENAME_PATTERN, filename)
+    assert bool(m) is match
+
+
+@pytest.mark.parametrize(("filenames", "indices"), [
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
+    pytest.param(("random.asdf",
+                  "VBI_L1_not_a_proper_name.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
+                  "VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 1, 3), id="2 other patterns & _user_tools & _metadata"),
+    pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
+                  "VISP_L1_20231016T184519_AJQWW.asdf",), (0, 1), id="Two patterns, no suffix"),
+    pytest.param(("VBI_L1_20231016T184519_AAAAA.asdf",
+                  "VBI_L1_20231016T184519_AAAAA_metadata.asdf",
"VBI_L1_20231116T184519_BBBBBBB.asdf", + "VBI_L1_20231216T184519_CCCCCCC.asdf", + "VBI_L1_20231216T184519_CCCCCCC_user_tools.asdf"), (1, 2, 4), id="Three patterns, mixed suffixes"), +]) +def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker): + asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames) + + asdf_file_paths = tuple(asdf_folder / fname for fname in filenames) + + load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf") + load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable") + + # The load_dataset call should throw a warning if any files are skipped, but + # not otherwise, the warning should have the filenames of any skipped files in + tuple_of_indices = indices if isinstance(indices, tuple) else (indices,) + if len(tuple_of_indices) == len(filenames): + datasets = load_dataset(asdf_folder) + else: + files_to_be_skipped = set(filenames).difference([filenames[i] for i in tuple_of_indices]) + with pytest.warns(DKISTUserWarning, match=f".*[{'|'.join([re.escape(f) for f in files_to_be_skipped])}].*"): + datasets = load_dataset(asdf_folder) + + if isinstance(indices, numbers.Integral): + load_from_asdf.assert_called_once_with(asdf_file_paths[indices]) + else: + calls = load_from_iterable.mock_calls + # We need to assert that _load_from_iterable is called with the right + # paths but in a order-invariant way. + assert len(calls) == 1 + assert set(calls[0].args[0]) == {asdf_file_paths[i] for i in indices}