feat: explicit canonical collection #3622

Merged
8 changes: 7 additions & 1 deletion backend/layers/common/entities.py
@@ -168,6 +168,12 @@ class CollectionMetadata:
links: List[Link]


@dataclass
class CanonicalCollection:
id: CollectionId
originally_published_at: Optional[datetime]
tombstoned: bool

@dataclass
class CollectionVersion:
collection_id: CollectionId
@@ -178,7 +184,7 @@ class CollectionVersion:
datasets: List[DatasetVersion]
published_at: Optional[datetime]
Contributor:

We already maintain published_at in CollectionVersion, so really we're only pulling in canonical_collection for tombstoned, right? At that point, should we consider just making tombstoned part of CollectionVersion and updating it on deletes?
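
For reference, a rough sketch of the two shapes under discussion: the PR's CanonicalCollection versus folding the flag into the version record. The second dataclass is hypothetical and named here only for illustration.

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from backend.layers.common.entities import CollectionId

# Shape taken by this PR: the tombstone flag lives on the canonical collection record.
@dataclass
class CanonicalCollection:
    id: CollectionId
    originally_published_at: Optional[datetime]
    tombstoned: bool

# Alternative floated in this comment: carry the flag on each version instead.
# A delete would then have to flip tombstoned on every version of the collection.
@dataclass
class TombstonedCollectionVersion:  # hypothetical shape, for illustration only
    collection_id: CollectionId
    published_at: Optional[datetime]
    tombstoned: bool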

Contributor:

Correction: published_at in the canonical collection is the date when the collection was first published.

While not strictly necessary, renaming that canonical collection field to originally_published_at would make the distinction clearer.

Member Author:

@atolopko-czi what naming convention do you prefer?

Contributor:

If this timestamp is in fact the first collection version's published_at value, then I like Nayib's suggestion. {first,initially}_published_at would also work, but I'm at a loss to pick a "best" choice. :)

Member Author:

originally_published_at is the best one imho. I figured the "originally" was implied by the fact that it belongs to the canonical collection, but as a wise man once said, explicit is better than implicit.

created_at: datetime
curator_name: str
canonical_collection: CanonicalCollection


class CollectionLinkType(Enum):
77 changes: 53 additions & 24 deletions backend/layers/persistence/persistence_mock.py
@@ -1,8 +1,10 @@
from dataclasses import dataclass
from datetime import datetime
import uuid
from backend.layers.persistence.persistence import DatabaseProviderInterface
from typing import Dict, Iterable, List, Optional
from backend.layers.common.entities import (
CanonicalCollection,
CollectionId,
CollectionMetadata,
CollectionVersion,
@@ -22,6 +24,11 @@
import copy


@dataclass
class _CanonicalCollection:
Contributor:

Actually, might we even call it _PublishedCollectionVersion? As this is internal to the persistence layer (and maybe even just the mock?) I'm not too opinionated here, but just tossing out the name.

Member Author:

I think this name is fine. Plus, if we manage to do the SQLite thing, we might not even need this mock anymore.

canonical_collection: CanonicalCollection
mapped_version: CollectionVersionId

class DatabaseProviderMock(DatabaseProviderInterface):

"""
@@ -36,7 +43,7 @@ class DatabaseProviderMock(DatabaseProviderInterface):
"""

# A mapping between canonical collection ids and collection versions.
collections: Dict[str, str]
collections: Dict[str, _CanonicalCollection]

# All the collection versions
collections_versions: Dict[str, CollectionVersion]
@@ -62,24 +69,40 @@ def _id():
def create_canonical_collection(self, owner: str, collection_metadata: CollectionMetadata) -> CollectionVersion:
collection_id = CollectionId(self._id())
version_id = CollectionVersionId(self._id())
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow())
canonical = CanonicalCollection(collection_id, None, False)
version = CollectionVersion(collection_id, version_id, owner, collection_metadata, None, [], None, datetime.utcnow(), canonical)
self.collections_versions[version_id.id] = version
# Don't set mappings here - those will be set when publishing the collection!
return copy.deepcopy(version)

def _update_version_with_canonical(self, version: CollectionVersion):
"""
Private method that returns a version updated with the canonical collection.
This is equivalent to a database double lookup (or join).
Note that for methods that require a table scan, this should be optimized by holding the
canonical_collections table in memory
"""
cc = self.collections.get(version.collection_id.id)
if cc is None:
return version
copied_version = copy.deepcopy(version)
copied_version.canonical_collection = cc.canonical_collection
return copied_version

def get_collection_mapped_version(self, collection_id: CollectionId) -> Optional[CollectionVersion]:
version_id = self.collections.get(collection_id.id)
if version_id is not None:
return copy.deepcopy(self.collections_versions[version_id])
cc = self.collections.get(collection_id.id)
if cc is not None:
version = self.collections_versions[cc.mapped_version.id]
return self._update_version_with_canonical(version)

def get_all_collections_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed
for version in self.collections_versions.values():
yield copy.deepcopy(version)
yield self._update_version_with_canonical(version)

def get_all_mapped_collection_versions(self) -> Iterable[CollectionVersion]: # TODO: add filters if needed
for version_id, collection_version in self.collections_versions.items():
if version_id in self.collections.values():
yield copy.deepcopy(collection_version)
if version_id in [c.mapped_version.id for c in self.collections.values()]:
yield self._update_version_with_canonical(collection_version)

def delete_collection(self, collection_id: CollectionId) -> None:
del self.collections[collection_id.id]
@@ -91,8 +114,9 @@ def save_collection_publisher_metadata(self, version_id: CollectionVersionId, pu
self.collections_versions[version_id.id].publisher_metadata = copy.deepcopy(publisher_metadata)

def add_collection_version(self, collection_id: CollectionId) -> CollectionVersion:
current_version_id = self.collections[collection_id.id]
current_version = self.collections_versions[current_version_id]
cc = self.collections[collection_id.id]
current_version_id = cc.mapped_version
current_version = self.collections_versions[current_version_id.id]
new_version_id = CollectionVersionId(self._id())
# Note: since datasets are immutable, there is no need to clone datasets here,
# but the list that contains datasets needs to be copied, since it's a pointer.
@@ -106,7 +130,8 @@ def add_collection_version(self, collection_id: CollectionId) -> CollectionVersi
publisher_metadata=current_version.publisher_metadata,
datasets=new_dataset_list,
published_at=None,
created_at=datetime.utcnow()
created_at=datetime.utcnow(),
canonical_collection=cc.canonical_collection
)
self.collections_versions[new_version_id.id] = collection_version
return copy.deepcopy(collection_version)
@@ -116,25 +141,34 @@ def delete_collection_version(self, version_id: CollectionVersionId) -> None:
del self.collections_versions[version_id.id]

def get_collection_version(self, version_id: CollectionVersionId) -> CollectionVersion:
return copy.deepcopy(self.collections_versions.get(version_id.id))
version = self.collections_versions.get(version_id.id)
if version is not None:
return self._update_version_with_canonical(version)

def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
# On a database, will require a secondary index on `collection_id` for an optimized lookup
for collection_version in self.collections_versions.values():
if collection_version.collection_id == collection_id:
yield copy.deepcopy(collection_version)
yield self._update_version_with_canonical(collection_version)

# MAYBE
def finalize_collection_version(self, collection_id: CollectionId, version_id: CollectionVersionId, published_at: Optional[datetime]) -> None:
self.collections[collection_id.id] = version_id.id
self.collections_versions[version_id.id].published_at = datetime.utcnow()
cc = self.collections.get(collection_id.id)
now = datetime.utcnow()
if cc is None:
self.collections[collection_id.id] = _CanonicalCollection(CanonicalCollection(collection_id, now, False), version_id)
else:
new_cc = copy.deepcopy(cc)
new_cc.mapped_version = version_id
self.collections[collection_id.id] = new_cc
self.collections_versions[version_id.id].published_at = now

# OR
def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None:
self.collections[collection_id.id] = version_id.id
# def update_collection_version_mapping(self, collection_id: CollectionId, version_id: CollectionVersionId) -> None:
# self.collections[collection_id.id] = version_id.id

def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None:
self.collections_versions[version_id.id].published_at = datetime.utcnow()
# def set_collection_version_published_at(self, version_id: CollectionVersionId) -> None:
# self.collections_versions[version_id.id].published_at = datetime.utcnow()

# END OR

@@ -145,11 +179,6 @@ def get_dataset(self, dataset_id: DatasetId) -> DatasetVersion:
def get_dataset_version(self, version_id: DatasetVersionId) -> DatasetVersion:
return copy.deepcopy(self.datasets_versions.get(version_id.id))

def get_all_versions_for_collection(self, collection_id: CollectionId) -> Iterable[CollectionVersion]:
for version in self.collections_versions.values():
if version.collection_id == collection_id:
yield version

def get_all_datasets(self) -> Iterable[DatasetVersion]:
"""
For now, this only returns all the active datasets, i.e. the datasets that belong to a published collection
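
A minimal usage sketch of the mock's publish flow as changed above, assuming DatabaseProviderMock() initializes its in-memory stores as in the existing mock; the metadata construction is stubbed because CollectionMetadata's full field list is not shown in this diff. This is an illustration, not code from the PR.

from backend.layers.persistence.persistence_mock import DatabaseProviderMock

provider = DatabaseProviderMock()
metadata = ...  # stands in for a real CollectionMetadata instance built elsewhere

# First publish: finalize_collection_version creates the _CanonicalCollection entry
# and stamps originally_published_at.
version = provider.create_canonical_collection("owner-id", metadata)
provider.finalize_collection_version(version.collection_id, version.version_id, None)
first = provider.get_collection_version(version.version_id)

# Second publish: the collection mapping moves to the new version, but the canonical
# record is deep-copied, so its timestamp is unchanged.
new_version = provider.add_collection_version(version.collection_id)
provider.finalize_collection_version(version.collection_id, new_version.version_id, None)
second = provider.get_collection_version(new_version.version_id)

assert second.canonical_collection.originally_published_at == first.canonical_collection.originally_published_at
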
27 changes: 26 additions & 1 deletion tests/unit/backend/layers/business/test_business.py
@@ -746,6 +746,9 @@ def test_create_collection_version_ok(self):
# The new version should not be published (aka Private)
self.assertIsNone(new_version.published_at)

# The canonical collection should be published
self.assertIsNotNone(new_version.canonical_collection.originally_published_at)

# get_collection still retrieves the original version
version = self.business_logic.get_published_collection_version(published_collection.collection_id)
self.assertEqual(version.version_id, published_collection.version_id)
@@ -812,14 +815,18 @@ def test_publish_version_ok(self):

published_version = self.database_provider.get_collection_version(unpublished_collection.version_id)
self.assertIsNotNone(published_version.published_at) # TODO: ideally, do a date assertion here (requires mocking)
self.assertIsNotNone(published_version.canonical_collection.originally_published_at)
self.assertEqual(published_version.published_at, published_version.canonical_collection.originally_published_at)

# The published and unpublished collection have the same collection_id and version_id
self.assertEqual(published_version.collection_id, unpublished_collection.collection_id)
self.assertEqual(published_version.version_id, unpublished_collection.version_id)

# get_collection retrieves the correct version
version = self.business_logic.get_published_collection_version(unpublished_collection.collection_id)
self.assertEqual(version.version_id, published_version.version_id)
if version: # pylance
self.assertEqual(version.version_id, published_version.version_id)


def test_publish_collection_with_no_datasets_fail(self):
"""
@@ -995,6 +1002,19 @@ def test_publish_version_with_replaced_dataset_ok(self):
self.assertCountEqual([replaced_dataset_version_id, dataset_id_to_keep], [d.version_id for d in version.datasets])


def test_publish_version_does_not_change_original_published_at_ok(self):
"""
When publishing a collection version, the published_at for the canonical collection
should not change
"""
first_version = self.initialize_published_collection()
second_version = self.business_logic.create_collection_version(first_version.collection_id)
self.business_logic.publish_collection_version(second_version.version_id)

canonical = self.business_logic.get_collection_version(second_version.version_id).canonical_collection
self.assertEqual(canonical.originally_published_at, first_version.published_at)


def test_get_all_collections_published_does_not_retrieve_old_versions(self):
"""
`get_collections` with is_published=True should not return versions that were previously
@@ -1010,6 +1030,11 @@ def test_get_all_collections_published_does_not_retrieve_old_versions(self):
self.assertEqual(1, len(all_collections))
self.assertEqual(all_collections[0].version_id, second_version.version_id)

# The canonical collection published_at should point to the original publication time
self.assertNotEqual(all_collections[0].canonical_collection.originally_published_at, all_collections[0].published_at)
self.assertEqual(all_collections[0].canonical_collection.originally_published_at, first_version.published_at)


def test_get_collection_versions_for_canonical_ok(self):
"""
`get_collection_versions_from_canonical` can be used to retrieve all the versions