feat: explicit canonical collection #3622

Merged
8 changes: 7 additions & 1 deletion backend/layers/common/entities.py
@@ -168,6 +168,12 @@ class CollectionMetadata:
links: List[Link]


@dataclass
class CanonicalCollection:
id: CollectionId
originally_published_at: Optional[datetime]
tombstoned: bool

@dataclass
class CollectionVersion:
collection_id: CollectionId
@@ -178,7 +184,7 @@ class CollectionVersion:
datasets: List[DatasetVersion]
published_at: Optional[datetime]
Contributor:

We already maintain published_at in CollectionVersion, so really we're only pulling in canonical_collection for tombstoned, right? At that point, should we consider just making tombstoned part of CollectionVersion and updating it on deletes?
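
For reference, a rough sketch of the two shapes under discussion: the PR's CanonicalCollection versus folding the flag into the version record. The second dataclass is hypothetical and named here only for illustration.

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from backend.layers.common.entities import CollectionId

# Shape taken by this PR: the tombstone flag lives on the canonical collection record.
@dataclass
class CanonicalCollection:
    id: CollectionId
    originally_published_at: Optional[datetime]
    tombstoned: bool

# Alternative floated in this comment: carry the flag on each version instead.
# A delete would then have to flip tombstoned on every version of the collection.
@dataclass
class TombstonedCollectionVersion:  # hypothetical shape, for illustration only
    collection_id: CollectionId
    published_at: Optional[datetime]
    tombstoned: bool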

Contributor:

Correction: published_at in the canonical collection is the date when the collection was first published.

While not strictly necessary, renaming that canonical collection field to originally_published_at would make the distinction clearer.

Member Author:

@atolopko-czi what naming convention do you prefer?

Contributor:

If this timestamp is in fact the first collection version's published_at value, then I like Nayib's suggestion. {first,initially}_published_at would also work, but I'm at a loss to pick a "best" choice. :)

Member Author:

originally_published_at is the best one imho. I figured the "originally" was implied by the fact that it belongs to the canonical collection, but as a wise man once said, explicit is better than implicit.

created_at: datetime
curator_name: str
canonical_collection: CanonicalCollection


class CollectionLinkType(Enum):
77 changes: 53 additions & 24 deletions backend/layers/persistence/persistence_mock.py
@@ -1,8 +1,10 @@
from dataclasses import dataclass
from datetime import datetime
import uuid
from backend.layers.persistence.persistence import DatabaseProviderInterface
from typing import Dict, Iterable, List, Optional
from backend.layers.common.entities import (
CanonicalCollection,
CollectionId,
CollectionMetadata,
CollectionVersion,
@@ -22,6 +24,11 @@
import copy


@dataclass
class _CanonicalCollection:
Contributor:

Actually, might we even call it _PublishedCollectionVersion? As this is internal to the persistence layer (and maybe even just the mock?) I'm not too opinionated here, but just tossing out the name.

Member Author:

I think this name is fine. Plus, if we manage to do the SQLite thing, we might not even need this mock anymore.

canonical_collection: CanonicalCollection
mapped_version: CollectionVersionId

class DatabaseProviderMock(DatabaseProviderInterface):

"""
@@ -36,7 +43,7 @@ class DatabaseProviderMock(DatabaseProviderInterface):
"""

# A mapping between canonical collection ids and collection versions.
collections: Dict[str, str]
collections: Dict[str, _CanonicalCollection]

# All the collection versions
collections_versions: Dict[str, CollectionVersion]
@@ -62,24 +69,40 @@ def _id():
def create_canonical_collection(self, owner: str, collection_metadata: CollectionMetadata) -> CollectionVersion:
collection_id = CollectionId(self._id())
version_id = CollectionVersionId(self._id())
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow())
canonical = CanonicalCollection(collection_id, None, False)
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow(), canonical)
self.collections_versions[version_id.id] = version
# Don't set mappings here - those will be set when publishing the collection!
return copy.deepcopy(version)

def _update_version_with_canonical(self, version: CollectionVersion):
"""
Private method that returns a version updated with the canonical collection.
This is equivalent to a database double lookup (or join).
Note that for methods that require a table scan, this should be optimized by holding the
canonical_collections table in memory
"""
cc = self.collections.get(version.collection_id.id)
if cc is None:
return version
copied_version = copy.deepcopy(version)
copied_version.canonical_collection = cc.canonical_collection
return copied_version

def get_collection_mapped_version(self, collection_id: CollectionId) -> Optional[CollectionVersion]:
version_id = self.collections.get(collection_id.id)
if version_id is not None:
return copy.deepcopy(self.collections_versions[version_id])
cc = self.collections.get(collection_id.id)
if cc is not None:
version = self.collections_versions[cc.mapped_version.id]
return self._update_version_with_canonical(version)

def get_all_collections_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed
for version in self.collections_versions.values():
yield copy.deepcopy(version)
yield self._update_version_with_canonical(version)

def get_all_mapped_collection_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed
for version_id, collection_version in self.collections_versions.items():
if version_id in self.collections.values():
yield copy.deepcopy(collection_version)
if version_id in [c.mapped_version.id for c in self.collections.values()]:
yield self._update_version_with_canonical(collection_version)

def delete_collection(self, collection_id: CollectionId) -> None:
del self.collections[collection_id.id]
@@ -91,8 +114,9 @@ def save_collection_publisher_metadata(self, version_id: CollectionVersionId, pu
self.collections_versions[version_id.id].publisher_metadata = copy.deepcopy(publisher_metadata)

def add_collection_version(self, collection_id: CollectionId) -> CollectionVersion:
current_version_id = self.collections[collection_id.id]
current_version = self.collections_versions[current_version_id]
cc = self.collections[collection_id.id]
current_version_id = cc.mapped_version
current_version = self.collections_versions[current_version_id.id]
new_version_id = CollectionVersionId(self._id())
# Note: since datasets are immutable, there is no need to clone datasets here,
# but the list that contains datasets needs to be copied, since it's a pointer.
@@ -106,7 +130,8 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi
publisher_metadata=current_version.publisher_metadata,
datasets=new_dataset_list,
published_at=None,
created_at=datetime.utcnow()
created_at=datetime.utcnow(),
canonical_collection=cc.canonical_collection
)
self.collections_versions[new_version_id.id] = collection_version
return copy.deepcopy(collection_version)
@@ -116,25 +141,34 @@ def delete_collection_version(self, version_id: CollectionVersionId) -> None:
del self.collections_versions[version_id.id]

def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion:
return copy.deepcopy(self.collections_versions.get(version_id.id))
version = self.collections_versions.get(version_id.id)
if version is not None:
return self._update_version_with_canonical(version)

def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
# On a database, will require a secondary index on `collection_id` for an optimized lookup
for collection_version in self.collections_versions.values():
if collection_version.collection_id == collection_id:
yield copy.deepcopy(collection_version)
yield self._update_version_with_canonical(collection_version)

# MAYBE
def finalize_collection_version(self, collection_id: CollectionId, version_id: CollectionVersionId, published_at: Optional[datetime]) -> None:
self.collections[collection_id.id] = version_id.id
self.collections_versions[version_id.id].published_at = datetime.utcnow()
cc = self.collections.get(collection_id.id)
now = datetime.utcnow()
if cc is None:
self.collections[collection_id.id] = _CanonicalCollection(CanonicalCollection(collection_id, now, False), version_id)
else:
new_cc = copy.deepcopy(cc)
new_cc.mapped_version = version_id
self.collections[collection_id.id] = new_cc
self.collections_versions[version_id.id].published_at = now

# OR
def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None:
self.collections[collection_id.id] = version_id.id
# def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None:
# self.collections[collection_id.id] = version_id.id

def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None:
self.collections_versions[version_id.id].published_at = datetime.utcnow()
# def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None:
# self.collections_versions[version_id.id].published_at = datetime.utcnow()

# END OR

@@ -145,11 +179,6 @@ def get_dataset(self, dataset_id: DatasetId) -> DatasetVersion:
def get_dataset_version(self, version_id: DatasetVersionId) -> DatasetVersion:
return copy.deepcopy(self.datasets_versions.get(version_id.id))

def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
for version in self.collections_versions.values():
if version.collection_id == collection_id:
yield version

def get_all_datasets(self) -> Iterable[DatasetVersion]:
"""
For now, this only returns all the active datasets, i.e. the datasets that belong to a published collection
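
A minimal usage sketch of the mock's publish flow as changed above, assuming DatabaseProviderMock() initializes its in-memory stores as in the existing mock; the metadata construction is stubbed because CollectionMetadata's full field list is not shown in this diff. This is an illustration, not code from the PR.

from backend.layers.persistence.persistence_mock import DatabaseProviderMock

provider = DatabaseProviderMock()
metadata = ...  # stands in for a real CollectionMetadata instance built elsewhere

# First publish: finalize_collection_version creates the _CanonicalCollection entry
# and stamps originally_published_at.
version = provider.create_canonical_collection("owner-id", metadata)
provider.finalize_collection_version(version.collection_id, version.version_id, None)
first = provider.get_collection_version(version.version_id)

# Second publish: the collection mapping moves to the new version, but the canonical
# record is deep-copied, so its timestamp is unchanged.
new_version = provider.add_collection_version(version.collection_id)
provider.finalize_collection_version(version.collection_id, new_version.version_id, None)
second = provider.get_collection_version(new_version.version_id)

assert second.canonical_collection.originally_published_at == first.canonical_collection.originally_published_at
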
27 changes: 26 additions & 1 deletion tests/unit/backend/layers/business/test_business.py
@@ -746,6 +746,9 @@ def test_create_collection_version_ok(self):
# The new version should not be published (aka Private)
self.assertIsNone(new_version.published_at)

# The canonical collection should be published
self.assertIsNotNone(new_version.canonical_collection.originally_published_at)

# get_collection still retrieves the original version
version = self.business_logic.get_published_collection_version(published_collection.collection_id)
self.assertEqual(version.version_id, published_collection.version_id)
@@ -812,14 +815,18 @@ def test_publish_version_ok(self):

published_version = self.database_provider.get_collection_version(unpublished_collection.version_id)
self.assertIsNotNone(published_version.published_at) # TODO: ideally, do a date assertion here (requires mocking)
self.assertIsNotNone(published_version.canonical_collection.originally_published_at)
self.assertEqual(published_version.published_at, published_version.canonical_collection.originally_published_at)

# The published and unpublished collection have the same collection_id and version_id
self.assertEqual(published_version.collection_id, unpublished_collection.collection_id)
self.assertEqual(published_version.version_id, unpublished_collection.version_id)

# get_collection retrieves the correct version
version = self.business_logic.get_published_collection_version(unpublished_collection.collection_id)
self.assertEqual(version.version_id, published_version.version_id)
if version: # pylance
self.assertEqual(version.version_id, published_version.version_id)


def test_publish_collection_with_no_datasets_fail(self):
"""
@@ -995,6 +1002,19 @@ def test_publish_version_with_replaced_dataset_ok(self):
self.assertCountEqual([replaced_dataset_version_id, dataset_id_to_keep], [d.version_id for d in version.datasets])


def test_publish_version_does_not_change_original_published_at_ok(self):
"""
When publishing a collection version, the published_at for the canonical collection
should not change
"""
first_version = self.initialize_published_collection()
second_version = self.business_logic.create_collection_version(first_version.collection_id)
self.business_logic.publish_collection_version(second_version.version_id)

canonical = self.business_logic.get_collection_version(second_version.version_id).canonical_collection
self.assertEqual(canonical.originally_published_at, first_version.published_at)


def test_get_all_collections_published_does_not_retrieve_old_versions(self):
"""
`get_collections` with is_published=True should not return versions that were previously
@@ -1010,6 +1030,11 @@ def test_get_all_collections_published_does_not_retrieve_old_versions(self):
self.assertEqual(1, len(all_collections))
self.assertEqual(all_collections[0].version_id, second_version.version_id)

# The canonical collection published_at should point to the original publication time
self.assertNotEqual(all_collections[0].canonical_collection.originally_published_at, all_collections[0].published_at)
self.assertEqual(all_collections[0].canonical_collection.originally_published_at, first_version.published_at)


def test_get_collection_versions_for_canonical_ok(self):
"""
`get_collection_versions_from_canonical` can be used to retrieve all the versions