From ea17ec3f4b0992661fa388468c947b44eb551901 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 16:00:20 -0700
Subject: [PATCH 1/7] Determine class hierarchy per-document instead of per-collection

The previous code made the invalid assumption that a given collection could
only store documents of a single type. That constraint does not exist in
databases conforming to the Berkeley schema (nmdc-schema v11+).
---
 nmdc_runtime/site/ops.py | 61 ++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index f6be4618..7e87cd56 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1022,46 +1022,59 @@ def materialize_alldocs(context) -> int:
     mdb = context.resources.mongo.db
     collection_names = populated_schema_collection_names_with_id_field(mdb)
 
-    for name in collection_names:
-        assert (
-            len(collection_name_to_class_names[name]) == 1
-        ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
+    # Insert a no-op as an anchor point for this comment.
+    #
+    # Note: There used to be code here that `assert`-ed that each collection could only contain documents of a single
+    # type. With the legacy schema, that assertion was true. With the Berkeley schema, it is false. That code was
+    # in place because subsequent code (further below) used a single document in a collection as the source of the
+    # class ancestry information of _all_ documents in that collection; an optimization that spared us from
+    # having to do the same for every single document in that collection. With the Berkeley schema, we have
+    # eliminated that optimization (since it is inadequate; it would produce some incorrect class ancestries
+    # for descendants of `PlannedProcess`, for example).
+    #
+    pass
 
     context.log.info(f"{collection_names=}")
 
     # Drop any existing `alldocs` collection (e.g. from previous use of this op).
+    #
+    # FIXME: This "nuke and pave" approach introduces a race condition.
+    # For example, if someone were to visit an API endpoint that uses the "alldocs" collection,
+    # the endpoint would fail to perform its job since the "alldocs" collection is temporarily missing.
+    #
     mdb.alldocs.drop()
 
     # Build alldocs
    context.log.info("constructing `alldocs` collection")
 
-    for collection in collection_names:
-        # Calculate class_hierarchy_as_list once per collection, using the first document in list
-        try:
-            nmdcdb = NMDCDatabase(
-                **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
-            )
-            exemplar = getattr(nmdcdb, collection)[0]
-            newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
-        except ValueError as e:
-            context.log.info(f"Collection {collection} does not exist.")
-            raise e
-
+    for collection_name in collection_names:
         context.log.info(
-            f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
+            f"Found {mdb[collection_name].estimated_document_count()} estimated documents for {collection_name=}."
         )
+
         # For each document in this collection, replace the value of the `type` field with
         # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
         # and insert the resulting document into the `alldocs` collection.
+        num_docs_inserted_for_collection = 0
+        for doc in mdb[collection_name].find():
+            # Instantiate the Python class represented by the document.
+            db_dict = {collection_name: [dissoc(doc, "_id")]}
+            nmdc_db = NMDCDatabase(**db_dict)
+            doc_as_instance = getattr(nmdc_db, collection_name)[0]
+
+            # Get the ancestry of that instance, as a list of class names (including its own class name).
+            ancestor_class_names = class_hierarchy_as_list(doc_as_instance)
+
+            # Make a document whose `type` field contains that list of class names.
+            # Note: This clobbers the existing contents of the `type` field.
+            new_doc = assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+
+            # Store this document in the `alldocs` collection.
+            mdb.alldocs.insert_one(new_doc)
+            num_docs_inserted_for_collection += 1
 
-        inserted_many_result = mdb.alldocs.insert_many(
-            [
-                assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
-                for doc in mdb[collection].find()
-            ]
-        )
         context.log.info(
-            f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
+            f"Inserted {num_docs_inserted_for_collection} documents for {collection_name=}."
         )
 
     # Re-idx for `alldocs` collection
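
For orientation, the per-document transformation this patch introduces boils down to the sketch below. It reuses the helper names already imported in `ops.py` (`NMDCDatabase`, `class_hierarchy_as_list`, `assoc`, `dissoc`); the standalone function and the `toolz` import shown here are illustrative assumptions, not part of the patch.

    from toolz import assoc, dissoc  # dict helpers used throughout ops.py

    def ancestry_aware_doc(doc: dict, collection_name: str) -> dict:
        # Instantiate the schema class for this one document (the Database constructor rejects `_id`).
        nmdc_db = NMDCDatabase(**{collection_name: [dissoc(doc, "_id")]})
        instance = getattr(nmdc_db, collection_name)[0]

        # Computed per document, not per collection; e.g. something like
        # ["Biosample", "MaterialEntity", "NamedThing"].
        ancestor_class_names = class_hierarchy_as_list(instance)

        # Drop `_id`, then overwrite `type` with the class-ancestry list.
        return assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
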
From 1c5788d5a1ce13a0aec90482672e0254ca0b0f10 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:24:23 -0700
Subject: [PATCH 2/7] Optimization: Process documents in batches according to their `type`

---
 nmdc_runtime/site/ops.py | 73 ++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 7e87cd56..0bf337e2 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1047,35 +1047,64 @@ def materialize_alldocs(context) -> int:
     # Build alldocs
     context.log.info("constructing `alldocs` collection")
 
+    # For each collection, group its documents by their `type` value, transform them, and load them into `alldocs`.
     for collection_name in collection_names:
         context.log.info(
             f"Found {mdb[collection_name].estimated_document_count()} estimated documents for {collection_name=}."
         )
 
-        # For each document in this collection, replace the value of the `type` field with
-        # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
-        # and insert the resulting document into the `alldocs` collection.
-        num_docs_inserted_for_collection = 0
-        for doc in mdb[collection_name].find():
-            # Instantiate the Python class represented by the document.
-            db_dict = {collection_name: [dissoc(doc, "_id")]}
-            nmdc_db = NMDCDatabase(**db_dict)
-            doc_as_instance = getattr(nmdc_db, collection_name)[0]
-
-            # Get the ancestry of that instance, as a list of class names (including its own class name).
-            ancestor_class_names = class_hierarchy_as_list(doc_as_instance)
-
-            # Make a document whose `type` field contains that list of class names.
-            # Note: This clobbers the existing contents of the `type` field.
-            new_doc = assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+        # Process all the distinct `type` values (i.e. value in the `type` field) of the documents in this collection.
+        #
+        # References:
+        # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
+        #
+        distinct_type_values = mdb[collection_name].distinct(key="type")
+        for type_value in distinct_type_values:
+
+            # Process all the documents in this collection that have this value in their `type` field.
+            #
+            # References:
+            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.count_documents
+            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.find
+            #
+            filter_ = {"type": type_value}
+            num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
+            docs_having_type = mdb[collection_name].find(filter=filter_)
+            context.log.info(
+                f"Found {len(num_docs_having_type)} documents having {type_value=} in {collection_name=}."
+            )
 
-            # Store this document in the `alldocs` collection.
-            mdb.alldocs.insert_one(new_doc)
-            num_docs_inserted_for_collection += 1
+            # Get a "representative" document from the result.
+            #
+            # Note: Since all of the documents in this batch have the same class ancestry, we will save time by
+            # determining the class ancestry of only _one_ of them (we call this the "representative") and then
+            # (later) attributing that class ancestry to all of them.
+            #
+            representative_doc = next(docs_having_type, default=None)
 
-        context.log.info(
-            f"Inserted {num_docs_inserted_for_collection} documents for {collection_name=}."
-        )
+            # Instantiate the Python class represented by the "representative" document.
+            db_dict = {collection_name: [dissoc(representative_doc, "_id")]}  # omits key incompatible with constructor
+            nmdc_db = NMDCDatabase(**db_dict)
+            representative_instance = getattr(nmdc_db, collection_name)[0]
+
+            # Get the class ancestry of that instance, as a list of class names (including its own class name).
+            ancestor_class_names = class_hierarchy_as_list(representative_instance)
+
+            # Store the documents belonging to this group, in the `alldocs` collection, setting their `type` field
+            # to the list of class names obtained from the "representative" document above.
+            #
+            # TODO: Document why clobbering the existing contents of the `type` field is OK.
+            #
+            inserted_many_result = mdb.alldocs.insert_many(
+                [
+                    assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+                    for doc in docs_having_type
+                ]
+            )
+            context.log.info(
+                f"Inserted {len(inserted_many_result.inserted_ids)} documents originally having {type_value=} "
+                f"in {collection_name=}."
+            )
 
     # Re-idx for `alldocs` collection
     mdb.alldocs.create_index("id", unique=True)
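
Stripped of the diff context, the batching strategy introduced above reads roughly as follows (same assumptions about helper names as in the earlier sketch). One subtlety: `next()` consumes the first document from the PyMongo cursor, so this sketch re-runs the query before `insert_many` to keep the representative document in the batch.

    for type_value in mdb[collection_name].distinct("type"):
        filter_ = {"type": type_value}

        # One representative per `type` group; its class ancestry is shared by the whole batch.
        representative_doc = next(mdb[collection_name].find(filter_), None)
        if representative_doc is None:
            continue  # no documents have this `type` value

        nmdc_db = NMDCDatabase(**{collection_name: [dissoc(representative_doc, "_id")]})
        ancestor_class_names = class_hierarchy_as_list(getattr(nmdc_db, collection_name)[0])

        # Transform and load the whole batch in one round trip.
        mdb.alldocs.insert_many(
            [
                assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
                for doc in mdb[collection_name].find(filter_)
            ]
        )
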
From 64c98381c45d456f8eb5ec1b63dfc67802c47865 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 20 Sep 2024 00:24:52 +0000
Subject: [PATCH 3/7] style: reformat

---
 nmdc_runtime/site/ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 0bf337e2..8d215d2c 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1083,7 +1083,9 @@ def materialize_alldocs(context) -> int:
             representative_doc = next(docs_having_type, default=None)
 
             # Instantiate the Python class represented by the "representative" document.
-            db_dict = {collection_name: [dissoc(representative_doc, "_id")]}  # omits key incompatible with constructor
+            db_dict = {
+                collection_name: [dissoc(representative_doc, "_id")]
+            }  # omits key incompatible with constructor
             nmdc_db = NMDCDatabase(**db_dict)
             representative_instance = getattr(nmdc_db, collection_name)[0]

From 61fc98489d2308bdb16dcaab5d2c41f3ed81ea16 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:37:53 -0700
Subject: [PATCH 4/7] Fix bug in `log.info` statement

---
 nmdc_runtime/site/ops.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 8d215d2c..b09853dd 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1071,7 +1071,7 @@ def materialize_alldocs(context) -> int:
             num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
             docs_having_type = mdb[collection_name].find(filter=filter_)
             context.log.info(
-                f"Found {len(num_docs_having_type)} documents having {type_value=} in {collection_name=}."
+                f"Found {num_docs_having_type} documents having {type_value=} in {collection_name=}."
             )
 
             # Get a "representative" document from the result.
@@ -1084,8 +1084,9 @@ def materialize_alldocs(context) -> int:
 
             # Instantiate the Python class represented by the "representative" document.
             db_dict = {
+                # Shed the `_id` attribute, since the constructor doesn't allow it.
                 collection_name: [dissoc(representative_doc, "_id")]
-            }  # omits key incompatible with constructor
+            }
             nmdc_db = NMDCDatabase(**db_dict)
             representative_instance = getattr(nmdc_db, collection_name)[0]
 

From 11e7bed1934e35bb0da4e58b43d8f8ce163c4770 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:40:27 -0700
Subject: [PATCH 5/7] Omit invalid `kwarg` key (oops)

---
 nmdc_runtime/site/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index b09853dd..845f79d0 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1080,7 +1080,7 @@ def materialize_alldocs(context) -> int:
             # determining the class ancestry of only _one_ of them (we call this the "representative") and then
             # (later) attributing that class ancestry to all of them.
             #
-            representative_doc = next(docs_having_type, default=None)
+            representative_doc = next(docs_having_type)
 
             # Instantiate the Python class represented by the "representative" document.
             db_dict = {

From 06b002d3c78fbb8ab080fd62d0056eb8259ad763 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:43:18 -0700
Subject: [PATCH 6/7] Clarify ambiguous log message

---
 nmdc_runtime/site/ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 845f79d0..78bfc962 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1105,8 +1105,8 @@ def materialize_alldocs(context) -> int:
                 ]
             )
             context.log.info(
-                f"Inserted {len(inserted_many_result.inserted_ids)} documents originally having {type_value=} "
-                f"in {collection_name=}."
+                f"Inserted {len(inserted_many_result.inserted_ids)} documents from {collection_name=} "
+                f"originally having {type_value=}."
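             )
 
     # Re-idx for `alldocs` collection

The two small fixes above come down to standard behavior: PyMongo's `count_documents` already returns an `int`, so wrapping it in `len()` raises a `TypeError`, and the built-in `next()` accepts its fallback only as a second positional argument, never as a `default=` keyword. In short:

    num_docs_having_type = mdb[collection_name].count_documents(filter_)  # an int, not a cursor
    representative_doc = next(docs_having_type, None)  # OK: positional fallback
    # next(docs_having_type, default=None)             # TypeError: next() takes no keyword arguments
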
From 6bb4524a8807f8026b797ae123c3f8a94a47344e Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 18:29:50 -0700
Subject: [PATCH 7/7] Add log message showing distinct "type" values in collection

---
 nmdc_runtime/site/ops.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 78bfc962..8b0d8faa 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1059,6 +1059,9 @@ def materialize_alldocs(context) -> int:
         # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
         #
         distinct_type_values = mdb[collection_name].distinct(key="type")
+        context.log.info(
+            f"Found {len(distinct_type_values)} distinct `type` values in {collection_name=}: {distinct_type_values=}"
+        )
         for type_value in distinct_type_values:
 
             # Process all the documents in this collection that have this value in their `type` field.
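
With the series applied, every document in `alldocs` carries its full class ancestry in its `type` array, which is what makes queries that span subtypes cheap. A hypothetical query (the exact class-name spelling depends on what `class_hierarchy_as_list` emits for the schema version in use):

    # Matches documents of classes such as Extraction or Pooling, and any other descendant
    # of PlannedProcess, because each alldocs document's `type` array lists all of its ancestor classes.
    planned_process_docs = mdb.alldocs.find({"type": "PlannedProcess"})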