Merge branch 'main' into better_tiled_dataset_plot
eigenbrot committed Jan 30, 2025
2 parents 05d5c7b + e34b1c5 commit b0190f3
Showing 17 changed files with 253 additions and 100 deletions.
12 changes: 6 additions & 6 deletions .circleci/config.yml
@@ -32,7 +32,7 @@ jobs:
jobname:
type: string
docker:
- image: cimg/python:3.12
- image: cimg/python:3.13
environment:
TOXENV=<< parameters.jobname >>
steps:
@@ -54,7 +54,7 @@ jobs:
jobname:
type: string
docker:
- image: cimg/python:3.12
- image: cimg/python:3.13
environment:
TOXENV: << parameters.jobname >>
GIT_SSH_COMMAND: ssh -i ~/.ssh/id_rsa_7b8fc81c13a3b446ec9aa50d3f626978
@@ -96,16 +96,16 @@ workflows:
matrix:
parameters:
jobname:
- "py312-figure"
- "py312-figure-devdeps"
- "py313-figure"
- "py313-figure-devdeps"

- deploy-reference-images:
name: baseline-<< matrix.jobname >>
matrix:
parameters:
jobname:
- "py312-figure"
- "py312-figure-devdeps"
- "py313-figure"
- "py313-figure-devdeps"
requires:
- << matrix.jobname >>
filters:
4 changes: 2 additions & 2 deletions .cruft.json
@@ -1,6 +1,6 @@
{
"template": "https://github.com/sunpy/package-template",
"commit": "ff0522bc171a1fc63022ed2a371f70669173012e",
"commit": "37ffb52646450caa4de8ea084725dbff65fe0995",
"checkout": null,
"context": {
"cookiecutter": {
@@ -32,7 +32,7 @@
".github/workflows/sub_package_update.yml"
],
"_template": "https://github.com/sunpy/package-template",
"_commit": "ff0522bc171a1fc63022ed2a371f70669173012e"
"_commit": "37ffb52646450caa4de8ea084725dbff65fe0995"
}
},
"directory": null
18 changes: 9 additions & 9 deletions .github/workflows/main.yml
@@ -33,22 +33,22 @@ jobs:
tests:
uses: OpenAstronomy/github-actions-workflows/.github/workflows/tox.yml@main
with:
default_python: '3.10'
default_python: '3.13'
coverage: 'codecov'
posargs: '--color=yes'
envs: |
- linux: py313
- linux: py312
- linux: py311
- windows: py311-online
- macos: py310
- linux: py310-oldestdeps
- windows: py312-online
- macos: py311
- linux: py311-oldestdeps
secrets:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

docs:
uses: OpenAstronomy/github-actions-workflows/.github/workflows/tox.yml@main
with:
default_python: '3.10'
default_python: '3.12'
coverage: 'codecov'
envs: |
- linux: build_docs-notebooks
@@ -65,7 +65,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
python-version: '3.13'
- run: python -m pip install -U --user build
- run: python -m build . --sdist
- run: python -m pip install -U --user twine
@@ -77,7 +77,7 @@
coverage: 'codecov'
posargs: '--color=yes'
envs: |
- linux: py311-devdeps
- linux: py313-devdeps
secrets:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

@@ -90,7 +90,7 @@ jobs:
)
uses: OpenAstronomy/github-actions-workflows/.github/workflows/publish_pure_python.yml@main
with:
python-version: '3.10'
python-version: '3.13'
test_extras: tests
test_command: pytest --pyargs dkist -k "not test_fail"
# We have to work around a github runner bug here: https://github.com/actions/runner/issues/2788#issuecomment-2145922705
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,7 +1,7 @@
repos:
# This should be before any formatting hooks like isort
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.9.1"
rev: "v0.9.2"
hooks:
- id: ruff
args: ["--fix"]
@@ -26,7 +26,7 @@ repos:
- id: mixed-line-ending
exclude: ".*(.fits|.fts|.fit|.header|.txt|tca.*|.asdf|.json|.hdr)$"
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
args: [ "--write-changes", "-D", "-", "-D", ".codespell-dict.txt"]
2 changes: 1 addition & 1 deletion .rtd-environment.yml
@@ -2,6 +2,6 @@ name: dkist
channels:
- conda-forge
dependencies:
- python=3.12
- python=3.13
- pip
- graphviz!=2.42.*,!=2.43.*
2 changes: 1 addition & 1 deletion changelog/491.feature.rst
@@ -1 +1 @@
Add a ``fig=`` keyword argument to `TiledDataset.plot` and make it default to the current figure.
Add a ``figure=`` keyword argument to `TiledDataset.plot` and make it default to the current figure.
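A minimal usage sketch of the renamed keyword, mirroring the updated figure tests later in this commit; the dataset directory path is hypothetical and ``load_dataset`` is assumed to return a tiled (mosaic) dataset for it:

import matplotlib.pyplot as plt
from dkist import load_dataset

# Hypothetical local directory containing a tiled dataset ASDF file.
ts = load_dataset("~/dkist_data/AJQWW")

# Pass an existing figure explicitly (previously spelled ``fig=``)...
fig = plt.figure(figsize=(12, 15))
ts.plot(0, figure=fig)

# ...or omit it, in which case the plot is drawn on the current figure.
ts.plot(0)
plt.show()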
1 change: 1 addition & 0 deletions changelog/503.bugfix.rst
@@ -0,0 +1 @@
Improve the ASDF detection code so out of date ASDF filenames generated by the DKIST data center are skipped if a newer filename is present.
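A hedged illustration of the new behaviour, assuming a hypothetical download directory that contains both an old-style and a new-style ASDF filename for the same dataset; only the newest file is loaded and a ``DKISTUserWarning`` names the files that were skipped:

import warnings
from dkist import load_dataset
from dkist.utils.exceptions import DKISTUserWarning

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Hypothetical directory holding e.g. both
    # VBI_L1_20231016T184519_AJQWW_user_tools.asdf and
    # VBI_L1_20231016T184519_AJQWW_metadata.asdf
    ds = load_dataset("~/dkist_data/AJQWW")

skipped = [str(w.message) for w in caught if issubclass(w.category, DKISTUserWarning)]
print(skipped)  # mentions the ignored *_user_tools.asdf file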
9 changes: 9 additions & 0 deletions changelog/507.breaking.rst
@@ -0,0 +1,9 @@
The minimum supported versions of dependencies and Python have been updated; this release requires:
* Python 3.11
* asdf 2.15 (and plugin version bumps)
* dask 2023.2
* matplotlib 3.7
* ndcube 2.1
* numpy 1.25
* parfive 2.1
* sunpy 5.0
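A small illustrative script (not part of the package) for checking an existing environment against these minimums; the version table below is copied from this changelog entry, and precise version comparison is left to the reader:

import sys
from importlib.metadata import PackageNotFoundError, version

MINIMUMS = {
    "asdf": "2.15",
    "dask": "2023.2",
    "matplotlib": "3.7",
    "ndcube": "2.1",
    "numpy": "1.25",
    "parfive": "2.1",
    "sunpy": "5.0",
}

print("Python OK" if sys.version_info >= (3, 11) else "Python 3.11+ required")
for name, minimum in MINIMUMS.items():
    try:
        print(f"{name}: installed {version(name)}, requires >= {minimum}")
    except PackageNotFoundError:
        print(f"{name}: not installed, requires >= {minimum}")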
92 changes: 85 additions & 7 deletions dkist/dataset/loader.py
@@ -1,11 +1,16 @@
import re
import warnings
import importlib.resources as importlib_resources
from pathlib import Path
from functools import singledispatch
from collections import defaultdict

from parfive import Results

import asdf

from dkist.utils.exceptions import DKISTUserWarning

try:
# first try to import from asdf.exceptions for asdf 2.15+
from asdf.exceptions import ValidationError
@@ -14,6 +19,9 @@
from asdf import ValidationError


ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"


def asdf_open_memory_mapping_kwarg(memmap: bool) -> dict:
if asdf.__version__ > "3.1.0":
return {"memmap": memmap}
@@ -138,21 +146,91 @@ def _load_from_path(path: Path):

def _load_from_directory(directory):
"""
Construct a `~dkist.dataset.Dataset` from a directory containing one
asdf file and a collection of FITS files.
Construct a `~dkist.dataset.Dataset` from a directory containing one (or
more) ASDF files and a collection of FITS files.
ASDF files have the generic pattern:
``{instrument}_L1_{start_time:%Y%m%dT%H%M%S}_{dataset_id}[_{suffix}].asdf``
where the ``_{suffix}`` on the end may be absent or one of a few different
suffixes which have been used at different times. When searching a
directory for one or more ASDF files to load, we should attempt to load
only one per dataset ID by selecting files in suffix order.
The order of suffixes is (from newest used to oldest):
- ``_metadata``
- ``_user_tools``
- None
The algorithm used to find ASDF files to load in a directory is therefore:
- Glob the directory for all ASDF files
- Group all results by the filename prefix up to and including the dataset ID
- Ignore any ASDF files with an old suffix if a new suffix is present
- Throw a warning to the user if any ASDF files with older suffixes are found
"""
base_path = Path(directory).expanduser()
asdf_files = tuple(base_path.glob("*.asdf"))

if not asdf_files:
raise ValueError(f"No asdf file found in directory {base_path}.")

if len(asdf_files) > 1:
return _load_from_iterable(asdf_files)

asdf_file = asdf_files[0]
if len(asdf_files) == 1:
return _load_from_asdf(asdf_files[0])

pattern = re.compile(ASDF_FILENAME_PATTERN)
candidates = []
asdfs_to_load = []
for filepath in asdf_files:
filename = filepath.name

# If the asdf file doesn't match the data center pattern then we load it
# as it's probably a custom user file
if pattern.match(filename) is None:
asdfs_to_load.append(filepath)
continue

# All the matches have to be checked
candidates.append(filepath)

# If we only have one match load it
if len(candidates) == 1:
asdfs_to_load += candidates
else:
# Now we group by prefix
matches = [pattern.match(fp.name) for fp in candidates]
grouped = defaultdict(list)
for m in matches:
prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
grouped[prefix].append(m.group("suffix"))

# Now we select the best suffix for each prefix
for prefix, suffixes in grouped.items():
if "_metadata" in suffixes:
asdfs_to_load.append(base_path / f"{prefix}_metadata.asdf")
elif "_user_tools" in suffixes:
asdfs_to_load.append(base_path / f"{prefix}_user_tools.asdf")
elif None in suffixes:
asdfs_to_load.append(base_path / f"{prefix}.asdf")
else:
# This branch should never be hit because the regex enumerates the suffixes
raise ValueError("Unknown suffix encountered.") # pragma: no cover

# Throw a warning if we have skipped any files
if ignored_files := set(asdf_files).difference(asdfs_to_load):
warnings.warn(
f"ASDF files with old names ({', '.join([a.name for a in ignored_files])}) "
"were found in this directory and ignored. You may want to delete these files.",
DKISTUserWarning
)

if len(asdfs_to_load) == 1:
return _load_from_asdf(asdfs_to_load[0])

return _load_from_iterable(asdfs_to_load)

return _load_from_asdf(asdf_file)


def _load_from_asdf(filepath):
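A standalone sketch of the selection logic added above, using the regex and suffix priority from ``dkist/dataset/loader.py`` and filenames taken from the new tests; it prints the single file that would be loaded for dataset ID AJQWW:

import re
from collections import defaultdict

# Copied from dkist/dataset/loader.py in this commit.
ASDF_FILENAME_PATTERN = r"^(?P<instrument>[A-Z-]+)_L1_(?P<timestamp>\d{8}T\d{6})_(?P<datasetid>[A-Z]{5,})(?P<suffix>_user_tools|_metadata)?.asdf$"

# Newest suffix first, matching the loader's selection order.
SUFFIX_PRIORITY = ("_metadata", "_user_tools", None)

filenames = [
    "VBI_L1_20231016T184519_AJQWW.asdf",
    "VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
    "VBI_L1_20231016T184519_AJQWW_metadata.asdf",
]

grouped = defaultdict(list)
for name in filenames:
    m = re.match(ASDF_FILENAME_PATTERN, name)
    prefix = m.string.removesuffix(".asdf").removesuffix(m.group("suffix") or "")
    grouped[prefix].append(m.group("suffix"))

for prefix, suffixes in grouped.items():
    best = next(s for s in SUFFIX_PRIORITY if s in suffixes)
    print(f"{prefix}{best or ''}.asdf")  # VBI_L1_20231016T184519_AJQWW_metadata.asdf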
87 changes: 87 additions & 0 deletions dkist/dataset/tests/test_load_dataset.py
@@ -1,4 +1,6 @@
import re
import shutil
import numbers

import pytest
from parfive import Results
@@ -7,6 +9,8 @@

from dkist import Dataset, TiledDataset, load_dataset
from dkist.data.test import rootdir
from dkist.dataset.loader import ASDF_FILENAME_PATTERN
from dkist.utils.exceptions import DKISTUserWarning


@pytest.fixture
@@ -114,3 +118,86 @@ def test_not_dkist_asdf(tmp_path):

with pytest.raises(TypeError, match="not a valid DKIST"):
load_dataset(tmp_path / "test.asdf")


def generate_asdf_folder(tmp_path, asdf_path, filenames):
for fname in filenames:
shutil.copy(asdf_path, tmp_path / fname)

return tmp_path


@pytest.mark.parametrize(("filename", "match"), [
("VBI_L1_20231016T184519_AJQWW.asdf", True),
("VBI_L1_20231016T184519_AAAA.asdf", False),
("VBI_L1_20231016T184519_AJQWW_user_tools.asdf", True),
("VBI_L1_20231016T184519_AJQWW_metadata.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW_user_tools.asdf", True),
("DL-NIRSP_L1_20231016T184519_AJQWW_metadata.asdf", True),
("VISP_L1_99999999T184519_AAAAAAA.asdf", True),
("VISP_L1_20231016T888888_AAAAAAA_user_tools.asdf", True),
("VISP_L1_20231016T184519_AAAAAAA_metadata.asdf", True),
("VISP_L1_20231016T184519_AAAAAAA_unknown.asdf", False),
("VISP_L1_20231016T184519.asdf", False),
("wibble.asdf", False),
])
def test_asdf_regex(filename, match):
m = re.match(ASDF_FILENAME_PATTERN, filename)
assert bool(m) is match


@pytest.mark.parametrize(("filenames", "indices"), [
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",), 0, id="Single no suffix"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 0, id="single _user_tools"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 0, id="single _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW_unknown.asdf",), 0, id="single _unknown"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), 1, id="none & _user_tools"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",), 2, id="_user_tools & _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",
"VBI_L1_20231016T184519_AJQWW_unknown.asdf"), (2, 3), id="_user_tools & _metadata & _unknown"),
pytest.param(("random.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",), (0, 1), id="other pattern & _user_tools"),
pytest.param(("random.asdf",
"VBI_L1_not_a_proper_name.asdf",
"VBI_L1_20231016T184519_AJQWW_user_tools.asdf",
"VBI_L1_20231016T184519_AJQWW_metadata.asdf",), (0, 1, 3), id="2 other patterns & _user_tools & _metadata"),
pytest.param(("VBI_L1_20231016T184519_AJQWW.asdf",
"VISP_L1_20231016T184519_AJQWW.asdf",), (0, 1), id="Two patterns, no suffix"),
pytest.param(("VBI_L1_20231016T184519_AAAAA.asdf",
"VBI_L1_20231016T184519_AAAAA_metadata.asdf",
"VBI_L1_20231116T184519_BBBBBBB.asdf",
"VBI_L1_20231216T184519_CCCCCCC.asdf",
"VBI_L1_20231216T184519_CCCCCCC_user_tools.asdf"), (1, 2, 4), id="Three patterns, mixed suffixes"),
])
def test_select_asdf(tmp_path, asdf_path, filenames, indices, mocker):
asdf_folder = generate_asdf_folder(tmp_path, asdf_path, filenames)

asdf_file_paths = tuple(asdf_folder / fname for fname in filenames)

load_from_asdf = mocker.patch("dkist.dataset.loader._load_from_asdf")
load_from_iterable = mocker.patch("dkist.dataset.loader._load_from_iterable")

# The load_dataset call should throw a warning if any files are skipped, but
# not otherwise; the warning should include the filenames of any skipped files.
tuple_of_indices = indices if isinstance(indices, tuple) else (indices,)
if len(tuple_of_indices) == len(filenames):
datasets = load_dataset(asdf_folder)
else:
files_to_be_skipped = set(filenames).difference([filenames[i] for i in tuple_of_indices])
with pytest.warns(DKISTUserWarning, match=f".*[{'|'.join([re.escape(f) for f in files_to_be_skipped])}].*"):
datasets = load_dataset(asdf_folder)

if isinstance(indices, numbers.Integral):
load_from_asdf.assert_called_once_with(asdf_file_paths[indices])
else:
calls = load_from_iterable.mock_calls
# We need to assert that _load_from_iterable is called with the right
# paths but in a order-invariant way.
assert len(calls) == 1
assert set(calls[0].args[0]) == {asdf_file_paths[i] for i in indices}
4 changes: 2 additions & 2 deletions dkist/dataset/tests/test_tiled_dataset.py
@@ -89,7 +89,7 @@ def test_tileddataset_plot(share_zscale):
ds = TiledDataset(np.array(newtiles).reshape(ori_ds.shape), inventory=newtiles[0].inventory)

fig = plt.figure(figsize=(12, 15))
ds.plot(0, share_zscale=share_zscale, fig=fig)
ds.plot(0, share_zscale=share_zscale, figure=fig)

return plt.gcf()

@@ -118,7 +118,7 @@ def test_tileddataset_plot_limit_swapping(swap_tile_limits):
assert non_square_ds.shape[0] != non_square_ds.shape[1] # Just in case the underlying data change for some reason

fig = plt.figure(figsize=(12, 15))
non_square_ds.plot(0, share_zscale=False, swap_tile_limits=swap_tile_limits, fig=fig)
non_square_ds.plot(0, share_zscale=False, swap_tile_limits=swap_tile_limits, figure=fig)

assert fig.axes[0].get_gridspec().get_geometry() == non_square_ds.shape[::-1]
for ax in fig.axes:
