Skip to content

Commit

Permalink
Merge pull request #709 from lsst/tickets/DM-33492
Browse files Browse the repository at this point in the history
Move dataset UUID generation code to a public class.
  • Loading branch information
andy-slac authored Aug 11, 2022
2 parents b0940af + d435dba commit 3aec807
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 62 deletions.
9 changes: 8 additions & 1 deletion python/lsst/daf/butler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
from .core import *

# Import the registry subpackage directly for other symbols.
from .registry import CollectionSearch, CollectionType, DatasetIdGenEnum, Registry, RegistryConfig
from .registry import (
CollectionSearch,
CollectionType,
DatasetIdFactory,
DatasetIdGenEnum,
Registry,
RegistryConfig,
)
from .transfers import YamlRepoExportBackend, YamlRepoImportBackend
from .version import *
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/registries/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
QueryDimensionRecordsModel,
)
from ..registry import CollectionSearch, CollectionType, Registry, RegistryConfig, RegistryDefaults
from ..registry.interfaces import DatasetIdGenEnum
from ..registry.interfaces import DatasetIdFactory, DatasetIdGenEnum
from ..registry.summaries import CollectionSummary

if TYPE_CHECKING:
Expand Down Expand Up @@ -120,6 +120,10 @@ def __init__(self, server_uri: ResourcePath, defaults: RegistryDefaults, writeab
self._db = server_uri
self._defaults = defaults

# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

# All PUT calls should be short-circuited if not writeable.
self._writeable = writeable

Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registries/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
RegistryDefaults,
queries,
)
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdGenEnum, RunRecord
from ..registry.interfaces import ChainedCollectionRecord, DatasetIdFactory, DatasetIdGenEnum, RunRecord
from ..registry.managers import RegistryManagerInstances, RegistryManagerTypes
from ..registry.queries import Query
from ..registry.summaries import CollectionSummary
Expand Down Expand Up @@ -205,6 +205,9 @@ def __init__(self, database: Database, defaults: RegistryDefaults, managers: Reg
# can only be done after most of the rest of Registry has already been
# initialized, and must be done before the property getter is used.
self.defaults = defaults
# In the future DatasetIdFactory may become configurable and this
# instance will need to be shared with datasets manager.
self.datasetIdFactory = DatasetIdFactory()

def __str__(self) -> str:
return str(self._db)
Expand Down
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/registry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from ._defaults import *
from ._exceptions import *
from ._registry import *
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .wildcards import CollectionSearch

# Some modules intentionally not imported, either because they are purely
Expand Down
5 changes: 4 additions & 1 deletion python/lsst/daf/butler/registry/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
from ._collectionType import CollectionType
from ._config import RegistryConfig
from ._defaults import RegistryDefaults
from .interfaces import DatasetIdGenEnum
from .interfaces import DatasetIdFactory, DatasetIdGenEnum
from .queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from .summaries import CollectionSummary
from .wildcards import CollectionSearch
Expand Down Expand Up @@ -1640,3 +1640,6 @@ def queryDatasetAssociations(
storageClasses: StorageClassFactory
"""All storage classes known to the registry (`StorageClassFactory`).
"""

datasetIdFactory: DatasetIdFactory
"""Factory for dataset IDs."""
64 changes: 9 additions & 55 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
__all__ = ("ByDimensionsDatasetRecordStorage",)

import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple
from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Optional, Sequence, Set

import sqlalchemy
from lsst.daf.butler import (
Expand All @@ -22,7 +22,7 @@
ConflictingDefinitionError,
UnsupportedIdGeneratorError,
)
from lsst.daf.butler.registry.interfaces import DatasetIdGenEnum, DatasetRecordStorage
from lsst.daf.butler.registry.interfaces import DatasetIdFactory, DatasetIdGenEnum, DatasetRecordStorage

from ...summaries import GovernorDimensionRestriction
from .tables import makeTagTableSpec
Expand Down Expand Up @@ -645,10 +645,9 @@ class ByDimensionsDatasetRecordStorageUUID(ByDimensionsDatasetRecordStorage):
dataset IDs.
"""

NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
"""Namespace UUID used for UUID5 generation. Do not change. This was
produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
"""
idMaker = DatasetIdFactory()
"""Factory for dataset IDs. In the future this factory may be shared with
other classes (e.g. Registry)."""

def insert(
self,
Expand All @@ -669,7 +668,7 @@ def insert(
dataIdList.append(dataId)
rows.append(
{
"id": self._makeDatasetId(run, dataId, idMode),
"id": self.idMaker.makeDatasetId(run.name, self.datasetType, dataId, idMode),
"dataset_type_id": self._dataset_type_id,
self._runKeyColumn: run.key,
}
Expand Down Expand Up @@ -724,7 +723,9 @@ def import_(
# this code supports mixed types or missing IDs.
datasetId = dataset.id if isinstance(dataset.id, uuid.UUID) else None
if datasetId is None:
datasetId = self._makeDatasetId(run, dataset.dataId, idGenerationMode)
datasetId = self.idMaker.makeDatasetId(
run.name, self.datasetType, dataset.dataId, idGenerationMode
)
dataIds[datasetId] = dataset.dataId
governorValues.update_extract(dataset.dataId)

Expand Down Expand Up @@ -895,50 +896,3 @@ def _validateImport(self, tmp_tags: sqlalchemy.schema.Table, run: RunRecord) ->
raise ConflictingDefinitionError(
f"Existing dataset type and dataId does not match new dataset: {row._asdict()}"
)

def _makeDatasetId(
self, run: RunRecord, dataId: DataCoordinate, idGenerationMode: DatasetIdGenEnum
) -> uuid.UUID:
"""Generate dataset ID for a dataset.
Parameters
----------
run : `RunRecord`
The record object describing the RUN collection for the dataset.
dataId : `DataCoordinate`
Expanded data ID for the dataset.
idGenerationMode : `DatasetIdGenEnum`
ID generation option. `~DatasetIdGenEnum.UNIQUE` make a random
UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
deterministic UUID5-type ID based on a dataset type name and
``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
deterministic UUID5-type ID based on a dataset type name, run
collection name, and ``dataId``.
Returns
-------
datasetId : `uuid.UUID`
Dataset identifier.
"""
if idGenerationMode is DatasetIdGenEnum.UNIQUE:
return uuid.uuid4()
else:
# WARNING: If you modify this code make sure that the order of
# items in the `items` list below never changes.
items: List[Tuple[str, str]] = []
if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
items = [
("dataset_type", self.datasetType.name),
]
elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
items = [
("dataset_type", self.datasetType.name),
("run", run.name),
]
else:
raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

for name, value in sorted(dataId.byName().items()):
items.append((name, str(value)))
data = ",".join(f"{key}={value}" for key, value in items)
return uuid.uuid5(self.NS_UUID, data)
72 changes: 70 additions & 2 deletions python/lsst/daf/butler/registry/interfaces/_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@

from __future__ import annotations

__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdGenEnum")
__all__ = ("DatasetRecordStorageManager", "DatasetRecordStorage", "DatasetIdFactory", "DatasetIdGenEnum")

import enum
import uuid
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Optional, Tuple
from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Tuple

import sqlalchemy.sql

Expand Down Expand Up @@ -60,6 +61,73 @@ class DatasetIdGenEnum(enum.Enum):
"""


class DatasetIdFactory:
"""Factory for dataset IDs (UUIDs).
For now the logic is hard-coded and is controlled by the user-provided
value of `DatasetIdGenEnum`. In the future we may implement a configurable
logic that can guess `DatasetIdGenEnum` value from other parameters.
"""

NS_UUID = uuid.UUID("840b31d9-05cd-5161-b2c8-00d32b280d0f")
"""Namespace UUID used for UUID5 generation. Do not change. This was
produced by `uuid.uuid5(uuid.NAMESPACE_DNS, "lsst.org")`.
"""

def makeDatasetId(
self,
run: str,
datasetType: DatasetType,
dataId: DataCoordinate,
idGenerationMode: DatasetIdGenEnum,
) -> uuid.UUID:
"""Generate dataset ID for a dataset.
Parameters
----------
run : `str`
Name of the RUN collection for the dataset.
datasetType : `DatasetType`
Dataset type.
dataId : `DataCoordinate`
Expanded data ID for the dataset.
idGenerationMode : `DatasetIdGenEnum`
ID generation option. `~DatasetIdGenEnum.UNIQUE` makes a random
UUID4-type ID. `~DatasetIdGenEnum.DATAID_TYPE` makes a
deterministic UUID5-type ID based on a dataset type name and
``dataId``. `~DatasetIdGenEnum.DATAID_TYPE_RUN` makes a
deterministic UUID5-type ID based on a dataset type name, run
collection name, and ``dataId``.
Returns
-------
datasetId : `uuid.UUID`
Dataset identifier.
"""
if idGenerationMode is DatasetIdGenEnum.UNIQUE:
return uuid.uuid4()
else:
# WARNING: If you modify this code make sure that the order of
# items in the `items` list below never changes.
items: List[Tuple[str, str]] = []
if idGenerationMode is DatasetIdGenEnum.DATAID_TYPE:
items = [
("dataset_type", datasetType.name),
]
elif idGenerationMode is DatasetIdGenEnum.DATAID_TYPE_RUN:
items = [
("dataset_type", datasetType.name),
("run", run),
]
else:
raise ValueError(f"Unexpected ID generation mode: {idGenerationMode}")

for name, value in sorted(dataId.byName().items()):
items.append((name, str(value)))
data = ",".join(f"{key}={value}" for key, value in items)
return uuid.uuid5(self.NS_UUID, data)


class DatasetRecordStorage(ABC):
"""An interface that manages the records associated with a particular
`DatasetType`.
Expand Down
27 changes: 27 additions & 0 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -2932,3 +2932,30 @@ def testSkyPixDatasetQueries(self):
{data_id},
)
self.assertEqual(set(registry.queryDatasets(dataset_type, collections=run)), {ref})

def testDatasetIdFactory(self):
"""Simple test for DatasetIdFactory, mostly to catch potential changes
in its API.
"""
registry = self.makeRegistry()
factory = registry.datasetIdFactory
dataset_type = DatasetType(
"datasetType",
dimensions=["detector", "instrument"],
universe=registry.dimensions,
storageClass="int",
)
run = "run"
data_id = DataCoordinate.standardize(instrument="Cam1", detector=1, graph=dataset_type.dimensions)

datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.UNIQUE)
self.assertIsInstance(datasetId, uuid.UUID)
self.assertEquals(datasetId.version, 4)

datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE)
self.assertIsInstance(datasetId, uuid.UUID)
self.assertEquals(datasetId.version, 5)

datasetId = factory.makeDatasetId(run, dataset_type, data_id, DatasetIdGenEnum.DATAID_TYPE_RUN)
self.assertIsInstance(datasetId, uuid.UUID)
self.assertEquals(datasetId.version, 5)

0 comments on commit 3aec807

Please sign in to comment.