-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: explicit canonical collection #3622
Changes from all commits
4ca7937
b372b53
2bfc69c
bc9b7df
d85d043
f7aac2d
fc19054
e4d6afa
f71f85f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
from dataclasses import dataclass | ||
from datetime import datetime | ||
import uuid | ||
from backend.layers.persistence.persistence import DatabaseProviderInterface | ||
from typing import Dict, Iterable, List, Optional | ||
from backend.layers.common.entities import ( | ||
CanonicalCollection, | ||
CollectionId, | ||
CollectionMetadata, | ||
CollectionVersion, | ||
|
@@ -22,6 +24,11 @@ | |
import copy | ||
|
||
|
||
@dataclass | ||
class _CanonicalCollection: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, might we even call it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this name is fine. Plus, if we manage to do the SQLite thing, we might not even need this mock anymore. |
||
canonical_collection: CanonicalCollection | ||
mapped_version: CollectionVersionId | ||
|
||
class DatabaseProviderMock(DatabaseProviderInterface): | ||
|
||
""" | ||
|
@@ -36,7 +43,7 @@ class DatabaseProviderMock(DatabaseProviderInterface): | |
""" | ||
|
||
# A mapping between canonical collection ids and collection versions. | ||
collections: Dict[str, str] | ||
collections: Dict[str, _CanonicalCollection] | ||
|
||
# All the collection versions | ||
collections_versions: Dict[str, CollectionVersion] | ||
|
@@ -62,24 +69,40 @@ def _id(): | |
def create_canonical_collection(self, owner: str, collection_metadata: CollectionMetadata) -> CollectionVersion: | ||
collection_id = CollectionId(self._id()) | ||
version_id = CollectionVersionId(self._id()) | ||
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow()) | ||
canonical = CanonicalCollection(collection_id, None, False) | ||
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow(), canonical) | ||
self.collections_versions[version_id.id] = version | ||
# Don't set mappings here - those will be set when publishing the collection! | ||
return copy.deepcopy(version) | ||
|
||
def _update_version_with_canonical(self, version: CollectionVersion): | ||
""" | ||
Private method that returns a version updated with the canonical collection. | ||
This is equivalent to a database double lookup (or join). | ||
Note that for methods that require a table scan, this should be optimized by holding the | ||
canonical_collections table in memory | ||
""" | ||
cc = self.collections.get(version.collection_id.id) | ||
if cc is None: | ||
return version | ||
copied_version = copy.deepcopy(version) | ||
copied_version.canonical_collection = cc.canonical_collection | ||
return copied_version | ||
|
||
def get_collection_mapped_version(self, collection_id: CollectionId) -> Optional[CollectionVersion]: | ||
version_id = self.collections.get(collection_id.id) | ||
if version_id is not None: | ||
return copy.deepcopy(self.collections_versions[version_id]) | ||
cc = self.collections.get(collection_id.id) | ||
if cc is not None: | ||
version = self.collections_versions[cc.mapped_version.id] | ||
return self._update_version_with_canonical(version) | ||
|
||
def get_all_collections_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed | ||
for version in self.collections_versions.values(): | ||
yield copy.deepcopy(version) | ||
yield self._update_version_with_canonical(version) | ||
|
||
def get_all_mapped_collection_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed | ||
for version_id, collection_version in self.collections_versions.items(): | ||
if version_id in self.collections.values(): | ||
yield copy.deepcopy(collection_version) | ||
if version_id in [c.mapped_version.id for c in self.collections.values()]: | ||
yield self._update_version_with_canonical(collection_version) | ||
|
||
def delete_collection(self, collection_id: CollectionId) -> None: | ||
del self.collections[collection_id.id] | ||
|
@@ -91,8 +114,9 @@ def save_collection_publisher_metadata(self, version_id: CollectionVersionId, pu | |
self.collections_versions[version_id.id].publisher_metadata = copy.deepcopy(publisher_metadata) | ||
|
||
def add_collection_version(self, collection_id: CollectionId) -> CollectionVersion: | ||
current_version_id = self.collections[collection_id.id] | ||
current_version = self.collections_versions[current_version_id] | ||
cc = self.collections[collection_id.id] | ||
current_version_id = cc.mapped_version | ||
current_version = self.collections_versions[current_version_id.id] | ||
new_version_id = CollectionVersionId(self._id()) | ||
# Note: since datasets are immutable, there is no need to clone datasets here, | ||
# but the list that contains datasets needs to be copied, since it's a pointer. | ||
|
@@ -106,7 +130,8 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi | |
publisher_metadata=current_version.publisher_metadata, | ||
datasets=new_dataset_list, | ||
published_at=None, | ||
created_at=datetime.utcnow() | ||
created_at=datetime.utcnow(), | ||
canonical_collection=cc.canonical_collection | ||
) | ||
self.collections_versions[new_version_id.id] = collection_version | ||
return copy.deepcopy(collection_version) | ||
|
@@ -116,25 +141,34 @@ def delete_collection_version(self, version_id: CollectionVersionId) -> None: | |
del self.collections_versions[version_id.id] | ||
|
||
def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion: | ||
return copy.deepcopy(self.collections_versions.get(version_id.id)) | ||
version = self.collections_versions.get(version_id.id) | ||
if version is not None: | ||
return self._update_version_with_canonical(version) | ||
|
||
def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]: | ||
# On a database, will require a secondary index on `collection_id` for an optimized lookup | ||
for collection_version in self.collections_versions.values(): | ||
if collection_version.collection_id == collection_id: | ||
yield copy.deepcopy(collection_version) | ||
yield self._update_version_with_canonical(collection_version) | ||
|
||
# MAYBE | ||
def finalize_collection_version(self, collection_id: CollectionId, version_id: CollectionVersionId, published_at: Optional[datetime]) -> None: | ||
self.collections[collection_id.id] = version_id.id | ||
self.collections_versions[version_id.id].published_at = datetime.utcnow() | ||
cc = self.collections.get(collection_id.id) | ||
now = datetime.utcnow() | ||
if cc is None: | ||
self.collections[collection_id.id] = _CanonicalCollection(CanonicalCollection(collection_id, now, False), version_id) | ||
else: | ||
new_cc = copy.deepcopy(cc) | ||
new_cc.mapped_version = version_id | ||
self.collections[collection_id.id] = new_cc | ||
self.collections_versions[version_id.id].published_at = now | ||
|
||
# OR | ||
def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None: | ||
self.collections[collection_id.id] = version_id.id | ||
# def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None: | ||
# self.collections[collection_id.id] = version_id.id | ||
|
||
def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None: | ||
self.collections_versions[version_id.id].published_at = datetime.utcnow() | ||
# def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None: | ||
# self.collections_versions[version_id.id].published_at = datetime.utcnow() | ||
|
||
# END OR | ||
|
||
|
@@ -145,11 +179,6 @@ def get_dataset(self, dataset_id: DatasetId) -> DatasetVersion: | |
def get_dataset_version(self, version_id: DatasetVersionId) -> DatasetVersion: | ||
return copy.deepcopy(self.datasets_versions.get(version_id.id)) | ||
|
||
def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]: | ||
for version in self.collections_versions.values(): | ||
if version.collection_id == collection_id: | ||
yield version | ||
|
||
def get_all_datasets(self) -> Iterable[DatasetVersion]: | ||
""" | ||
For now, this only returns all the active datasets, i.e. the datasets that belong to a published collection | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we already maintain published_at in the CollectionVersion, so really we're only pulling in canonical_collection for tombstoned, right? at that point, should we consider just making tombstone as part of CollectionVersion and updating on deletes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
correction: published_at in the canonical collection is the date where the collection was first published
While not necessary, changing that canonical collection field name to
originally_published_at
would be useful to distinguishThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@atolopko-czi what naming convention do you prefer?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this timestamp is in fact the first collection version's published_at value, then I like Nayib's suggestion. These also work
{first,initially}_published_at
, but I'm at a loss to pick a "best" choice. :)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
originally_published_at
is the best one imho. I figured theoriginally
was implicitly implied by the fact that belongs to the canonical, but as a wise man once said,explicit is better than implicit
.