diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py index 297bf687bf..13e2b0639f 100644 --- a/python/lsst/daf/butler/__init__.py +++ b/python/lsst/daf/butler/__init__.py @@ -13,6 +13,13 @@ from .core import * # Import the registry subpackage directly for other symbols. -from .registry import CollectionSearch, CollectionType, DatasetIdGenEnum, Registry, RegistryConfig +from .registry import ( + CollectionSearch, + CollectionType, + DatasetIdFactory, + DatasetIdGenEnum, + Registry, + RegistryConfig, +) from .transfers import YamlRepoExportBackend, YamlRepoImportBackend from .version import * diff --git a/python/lsst/daf/butler/registries/remote.py b/python/lsst/daf/butler/registries/remote.py index 01e738c101..ba8a4d2124 100644 --- a/python/lsst/daf/butler/registries/remote.py +++ b/python/lsst/daf/butler/registries/remote.py @@ -64,7 +64,7 @@ QueryDimensionRecordsModel, ) from ..registry import CollectionSearch, CollectionType, Registry, RegistryConfig, RegistryDefaults -from ..registry.interfaces import DatasetIdGenEnum +from ..registry.interfaces import DatasetIdFactory, DatasetIdGenEnum from ..registry.summaries import CollectionSummary if TYPE_CHECKING: @@ -120,6 +120,10 @@ def __init__(self, server_uri: ResourcePath, defaults: RegistryDefaults, writeab self._db = server_uri self._defaults = defaults + # In the future DatasetIdFactory may become configurable and this + # instance will need to be shared with datasets manager. + self.datasetIdFactory = DatasetIdFactory() + # All PUT calls should be short-circuited if not writeable. 
self._writeable = writeable diff --git a/python/lsst/daf/butler/registries/sql.py b/python/lsst/daf/butler/registries/sql.py index 94c78687db..77d9139b9d 100644 --- a/python/lsst/daf/butler/registries/sql.py +++ b/python/lsst/daf/butler/registries/sql.py @@ -74,7 +74,7 @@ RegistryDefaults, queries, ) -from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord +from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes from ..registry.queries import Query from ..registry.summaries import CollectionSummary @@ -205,6 +205,9 @@ def __init__(self, database: Database, defaults: RegistryDefaults, managers: Reg # can only be done after most of the rest of Registry has already been # initialized, and must be done before the property getter is used. self.defaults = defaults + # In the future DatasetIdFactory may become configurable and this + # instance will need to be shared with datasets manager. 
+ self.datasetIdFactory = DatasetIdFactory() def __str__(self) -> str: return str(self._db) diff --git a/python/lsst/daf/butler/registry/__init__.py b/python/lsst/daf/butler/registry/__init__.py index cc2e7818ba..0a6221c9ac 100644 --- a/python/lsst/daf/butler/registry/__init__.py +++ b/python/lsst/daf/butler/registry/__init__.py @@ -26,7 +26,7 @@ from ._defaults import * from ._exceptions import * from ._registry import * -from .interfaces import DatasetIdGenEnum +from .interfaces import DatasetIdFactory, DatasetIdGenEnum from .wildcards import CollectionSearch # Some modules intentionally not imported, either because they are purely diff --git a/python/lsst/daf/butler/registry/_registry.py b/python/lsst/daf/butler/registry/_registry.py index 9523bff604..97d33021b4 100644 --- a/python/lsst/daf/butler/registry/_registry.py +++ b/python/lsst/daf/butler/registry/_registry.py @@ -65,7 +65,7 @@ from ._collectionType import CollectionType from ._config import RegistryConfig from ._defaults import RegistryDefaults -from .interfaces import DatasetIdGenEnum +from .interfaces import DatasetIdFactory, DatasetIdGenEnum from .queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults from .summaries import CollectionSummary from .wildcards import CollectionSearch @@ -1640,3 +1640,6 @@ def queryDatasetAssociations( storageClasses: StorageClassFactory """All storage classes known to the registry (`StorageClassFactory`). 
""" + + datasetIdFactory: DatasetIdFactory + """Factory for dataset IDs.""" diff --git a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py index 02cd2ed23a..a99949cdcc 100644 --- a/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py +++ b/python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py @@ -3,7 +3,7 @@ __all__ = ("ByDimensionsDatasetRecordStorage",) import uuid -from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple +from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set import sqlalchemy from lsst.daf.butler import ( @@ -22,7 +22,7 @@ ConflictingDefinitionError, UnsupportedIdGeneratorError, ) -from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage +from lsst.daf.butler.registry.interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage from ...summaries import GovernorDimensionRestriction from .tables import makeTagTableSpec @@ -645,10 +645,9 @@ class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage): dataset IDs. """ - NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") - """Namespace UUID used for UUID5 generation. Do not change. This was - produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. - """ + idMaker = DatasetIdFactory() + """Factory for dataset IDs. In the future this factory may be shared with + other classes (e.g. Registry).""" def insert( self, @@ -669,7 +668,7 @@ def insert( dataIdList.append(dataId) rows.append( { - "id": self._makeDatasetId(run, dataId, idMode), + "id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode), "dataset_type_id": self._dataset_type_id, self._runKeyColumn: run.key, } @@ -724,7 +723,9 @@ def import_( # this code supports mixed types or missing IDs. 
datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None if datasetId is None: - datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode) + datasetId = self.idMaker.makeDatasetId( + run.name, self.datasetType, dataset.dataId, idGenerationMode + ) dataIds[datasetId] = dataset.dataId governorValues.update_extract(dataset.dataId) @@ -895,50 +896,3 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) -> raise ConflictingDefinitionError( f"Existing dataset type and dataId does not match new dataset: {row._asdict()}" ) - - def _makeDatasetId( - self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum - ) -> uuid.UUID: - """Generate dataset ID for a dataset. - - Parameters - ---------- - run : `RunRecord` - The record object describing the RUN collection for the dataset. - dataId : `DataCoordinate` - Expanded data ID for the dataset. - idGenerationMode : `DatasetIdGenEnum` - ID generation option. `~DatasetIdGenEnum.UNIQUE` make a random - UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a - deterministic UUID5-type ID based on a dataset type name and - ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a - deterministic UUID5-type ID based on a dataset type name, run - collection name, and ``dataId``. - - Returns - ------- - datasetId : `uuid.UUID` - Dataset identifier. - """ - if idGenerationMode is DatasetIdGenEnum.UNIQUE: - return uuid.uuid4() - else: - # WARNING: If you modify this code make sure that the order of - # items in the `items` list below never changes. 
- items: List[Tuple[str, str]] = [] - if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: - items = [ - ("dataset_type", self.datasetType.name), - ] - elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: - items = [ - ("dataset_type", self.datasetType.name), - ("run", run.name), - ] - else: - raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") - - for name, value in sorted(dataId.byName().items()): - items.append((name, str(value))) - data = ",".join(f"{key}={value}" for key, value in items) - return uuid.uuid5(self.NS_UUID, data) diff --git a/python/lsst/daf/butler/registry/interfaces/_datasets.py b/python/lsst/daf/butler/registry/interfaces/_datasets.py index 0edb0dec45..747df22cb9 100644 --- a/python/lsst/daf/butler/registry/interfaces/_datasets.py +++ b/python/lsst/daf/butler/registry/interfaces/_datasets.py @@ -21,11 +21,12 @@ from __future__ import annotations -__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum") +__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum") import enum +import uuid from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple +from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple import sqlalchemy.sql @@ -60,6 +61,73 @@ class DatasetIdGenEnum(enum.Enum): """ +class DatasetIdFactory: + """Factory for dataset IDs (UUIDs). + + For now the logic is hard-coded and is controlled by the user-provided + value of `DatasetIdGenEnum`. In the future we may implement a configurable + logic that can guess `DatasetIdGenEnum` value from other parameters. + """ + + NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f") + """Namespace UUID used for UUID5 generation. Do not change. This was + produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`. 
+ """ + + def makeDatasetId( + self, + run: str, + datasetType: DatasetType, + dataId: DataCoordinate, + idGenerationMode: DatasetIdGenEnum, + ) -> uuid.UUID: + """Generate dataset ID for a dataset. + + Parameters + ---------- + run : `str` + Name of the RUN collection for the dataset. + datasetType : `DatasetType` + Dataset type. + dataId : `DataCoordinate` + Expanded data ID for the dataset. + idGenerationMode : `DatasetIdGenEnum` + ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random + UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a + deterministic UUID5-type ID based on a dataset type name and + ``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a + deterministic UUID5-type ID based on a dataset type name, run + collection name, and ``dataId``. + + Returns + ------- + datasetId : `uuid.UUID` + Dataset identifier. + """ + if idGenerationMode is DatasetIdGenEnum.UNIQUE: + return uuid.uuid4() + else: + # WARNING: If you modify this code make sure that the order of + # items in the `items` list below never changes. + items: List[Tuple[str, str]] = [] + if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE: + items = [ + ("dataset_type", datasetType.name), + ] + elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN: + items = [ + ("dataset_type", datasetType.name), + ("run", run), + ] + else: + raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}") + + for name, value in sorted(dataId.byName().items()): + items.append((name, str(value))) + data = ",".join(f"{key}={value}" for key, value in items) + return uuid.uuid5(self.NS_UUID, data) + + class DatasetRecordStorage(ABC): """An interface that manages the records associated with a particular `DatasetType`. 
diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py
index 73badf5fb3..25b3b55a4a 100644
--- a/python/lsst/daf/butler/registry/tests/_registry.py
+++ b/python/lsst/daf/butler/registry/tests/_registry.py
@@ -2932,3 +2932,33 @@ def testSkyPixDatasetQueries(self):
             {data_id},
         )
         self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})
+
+    def testDatasetIdFactory(self):
+        """Simple test for DatasetIdFactory, mostly to catch potential changes
+        in its API.
+        """
+        registry = self.makeRegistry()
+        factory = registry.datasetIdFactory
+        dataset_type = DatasetType(
+            "datasetType",
+            dimensions=["detector", "instrument"],
+            universe=registry.dimensions,
+            storageClass="int",
+        )
+        run = "run"
+        data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)
+
+        # UNIQUE mode produces a random (version 4) UUID.
+        datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
+        self.assertIsInstance(datasetId, uuid.UUID)
+        self.assertEqual(datasetId.version, 4)
+
+        # DATAID_TYPE mode produces a name-based (version 5) UUID.
+        datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
+        self.assertIsInstance(datasetId, uuid.UUID)
+        self.assertEqual(datasetId.version, 5)
+
+        # DATAID_TYPE_RUN mode also produces a name-based (version 5) UUID.
+        datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
+        self.assertIsInstance(datasetId, uuid.UUID)
+        self.assertEqual(datasetId.version, 5)