From ea17ec3f4b0992661fa388468c947b44eb551901 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 16:00:20 -0700
Subject: [PATCH 1/7] Determine class hierarchy per-document instead of per-collection

The previous code made the invalid assumption that a given collection could
only store documents of a single type. That constraint does not exist in
databases conforming to the Berkeley schema (nmdc-schema v11+).
---
 nmdc_runtime/site/ops.py | 61 ++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index f6be4618..7e87cd56 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1022,46 +1022,59 @@ def materialize_alldocs(context) -> int:
     mdb = context.resources.mongo.db
     collection_names = populated_schema_collection_names_with_id_field(mdb)
 
-    for name in collection_names:
-        assert (
-            len(collection_name_to_class_names[name]) == 1
-        ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
+    # Insert a no-op as an anchor point for this comment.
+    #
+    # Note: There used to be code here that `assert`-ed that each collection could only contain documents of a single
+    # type. With the legacy schema, that assertion was true. With the Berkeley schema, it is false. That code was
+    # in place because subsequent code (further below) used a single document in a collection as the source of the
+    # class ancestry information of _all_ documents in that collection; an optimization that spared us from
+    # having to do the same for every single document in that collection. With the Berkeley schema, we have
+    # eliminated that optimization (since it is inadequate; it would produce some incorrect class ancestries
+    # for descendants of `PlannedProcess`, for example).
+    #
+    pass
 
     context.log.info(f"{collection_names=}")
 
     # Drop any existing `alldocs` collection (e.g. from previous use of this op).
+    #
+    # FIXME: This "nuke and pave" approach introduces a race condition.
+    # For example, if someone were to visit an API endpoint that uses the "alldocs" collection,
+    # the endpoint would fail to perform its job since the "alldocs" collection is temporarily missing.
+    #
     mdb.alldocs.drop()
 
     # Build alldocs
    context.log.info("constructing `alldocs` collection")
 
-    for collection in collection_names:
-        # Calculate class_hierarchy_as_list once per collection, using the first document in list
-        try:
-            nmdcdb = NMDCDatabase(
-                **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
-            )
-            exemplar = getattr(nmdcdb, collection)[0]
-            newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
-        except ValueError as e:
-            context.log.info(f"Collection {collection} does not exist.")
-            raise e
-
+    for collection_name in collection_names:
         context.log.info(
-            f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
+            f"Found {mdb[collection_name].estimated_document_count()} estimated documents for {collection_name=}."
         )
+
         # For each document in this collection, replace the value of the `type` field with
         # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
         # and insert the resulting document into the `alldocs` collection.
+        num_docs_inserted_for_collection = 0
+        for doc in mdb[collection_name].find():
+            # Instantiate the Python class represented by the document.
+            db_dict = {collection_name: [dissoc(doc, "_id")]}
+            nmdc_db = NMDCDatabase(**db_dict)
+            doc_as_instance = getattr(nmdc_db, collection_name)[0]
+
+            # Get the ancestry of that instance, as a list of class names (including its own class name).
+            ancestor_class_names = class_hierarchy_as_list(doc_as_instance)
+
+            # Make a document whose `type` field contains that list of class names.
+            # Note: This clobbers the existing contents of the `type` field.
+            new_doc = assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+
+            # Store this document in the `alldocs` collection.
+            mdb.alldocs.insert_one(new_doc)
+            num_docs_inserted_for_collection += 1
 
-        inserted_many_result = mdb.alldocs.insert_many(
-            [
-                assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
-                for doc in mdb[collection].find()
-            ]
-        )
         context.log.info(
-            f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
+            f"Inserted {num_docs_inserted_for_collection} documents for {collection_name=}."
         )
 
     # Re-idx for `alldocs` collection
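
For orientation, the per-document transformation this patch introduces boils down to the sketch below. It reuses the helper names already imported in `ops.py` (`NMDCDatabase`, `class_hierarchy_as_list`, `assoc`, `dissoc`); the standalone function and the `toolz` import shown here are illustrative assumptions, not part of the patch.

    from toolz import assoc, dissoc  # dict helpers used throughout ops.py

    def ancestry_aware_doc(doc: dict, collection_name: str) -> dict:
        # Instantiate the schema class for this one document (the Database constructor rejects `_id`).
        nmdc_db = NMDCDatabase(**{collection_name: [dissoc(doc, "_id")]})
        instance = getattr(nmdc_db, collection_name)[0]

        # Computed per document, not per collection; e.g. something like
        # ["Biosample", "MaterialEntity", "NamedThing"].
        ancestor_class_names = class_hierarchy_as_list(instance)

        # Drop `_id`, then overwrite `type` with the class-ancestry list.
        return assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
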
From 1c5788d5a1ce13a0aec90482672e0254ca0b0f10 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:24:23 -0700
Subject: [PATCH 2/7] Optimization: Process documents in batches according to their `type`

---
 nmdc_runtime/site/ops.py | 73 ++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 22 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 7e87cd56..0bf337e2 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1047,35 +1047,64 @@ def materialize_alldocs(context) -> int:
     # Build alldocs
     context.log.info("constructing `alldocs` collection")
 
+    # For each collection, group its documents by their `type` value, transform them, and load them into `alldocs`.
     for collection_name in collection_names:
         context.log.info(
             f"Found {mdb[collection_name].estimated_document_count()} estimated documents for {collection_name=}."
         )
 
-        # For each document in this collection, replace the value of the `type` field with
-        # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
-        # and insert the resulting document into the `alldocs` collection.
-        num_docs_inserted_for_collection = 0
-        for doc in mdb[collection_name].find():
-            # Instantiate the Python class represented by the document.
-            db_dict = {collection_name: [dissoc(doc, "_id")]}
-            nmdc_db = NMDCDatabase(**db_dict)
-            doc_as_instance = getattr(nmdc_db, collection_name)[0]
-
-            # Get the ancestry of that instance, as a list of class names (including its own class name).
-            ancestor_class_names = class_hierarchy_as_list(doc_as_instance)
-
-            # Make a document whose `type` field contains that list of class names.
-            # Note: This clobbers the existing contents of the `type` field.
-            new_doc = assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+        # Process all the distinct `type` values (i.e. value in the `type` field) of the documents in this collection.
+        #
+        # References:
+        # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
+        #
+        distinct_type_values = mdb[collection_name].distinct(key="type")
+        for type_value in distinct_type_values:
+
+            # Process all the documents in this collection that have this value in their `type` field.
+            #
+            # References:
+            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.count_documents
+            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.find
+            #
+            filter_ = {"type": type_value}
+            num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
+            docs_having_type = mdb[collection_name].find(filter=filter_)
+            context.log.info(
+                f"Found {len(num_docs_having_type)} documents having {type_value=} in {collection_name=}."
+            )
 
-            # Store this document in the `alldocs` collection.
-            mdb.alldocs.insert_one(new_doc)
-            num_docs_inserted_for_collection += 1
+            # Get a "representative" document from the result.
+            #
+            # Note: Since all of the documents in this batch have the same class ancestry, we will save time by
+            # determining the class ancestry of only _one_ of them (we call this the "representative") and then
+            # (later) attributing that class ancestry to all of them.
+            #
+            representative_doc = next(docs_having_type, default=None)
 
-        context.log.info(
-            f"Inserted {num_docs_inserted_for_collection} documents for {collection_name=}."
-        )
+            # Instantiate the Python class represented by the "representative" document.
+            db_dict = {collection_name: [dissoc(representative_doc, "_id")]}  # omits key incompatible with constructor
+            nmdc_db = NMDCDatabase(**db_dict)
+            representative_instance = getattr(nmdc_db, collection_name)[0]
+
+            # Get the class ancestry of that instance, as a list of class names (including its own class name).
+            ancestor_class_names = class_hierarchy_as_list(representative_instance)
+
+            # Store the documents belonging to this group, in the `alldocs` collection, setting their `type` field
+            # to the list of class names obtained from the "representative" document above.
+            #
+            # TODO: Document why clobbering the existing contents of the `type` field is OK.
+            #
+            inserted_many_result = mdb.alldocs.insert_many(
+                [
+                    assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
+                    for doc in docs_having_type
+                ]
+            )
+            context.log.info(
+                f"Inserted {len(inserted_many_result.inserted_ids)} documents originally having {type_value=} "
+                f"in {collection_name=}."
+            )
 
     # Re-idx for `alldocs` collection
     mdb.alldocs.create_index("id", unique=True)
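
Stripped of the diff context, the batching strategy introduced above reads roughly as follows (same assumptions about helper names as in the earlier sketch). One subtlety: `next()` consumes the first document from the PyMongo cursor, so this sketch re-runs the query before `insert_many` to keep the representative document in the batch.

    for type_value in mdb[collection_name].distinct("type"):
        filter_ = {"type": type_value}

        # One representative per `type` group; its class ancestry is shared by the whole batch.
        representative_doc = next(mdb[collection_name].find(filter_), None)
        if representative_doc is None:
            continue  # no documents have this `type` value

        nmdc_db = NMDCDatabase(**{collection_name: [dissoc(representative_doc, "_id")]})
        ancestor_class_names = class_hierarchy_as_list(getattr(nmdc_db, collection_name)[0])

        # Transform and load the whole batch in one round trip.
        mdb.alldocs.insert_many(
            [
                assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
                for doc in mdb[collection_name].find(filter_)
            ]
        )
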
From 64c98381c45d456f8eb5ec1b63dfc67802c47865 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 20 Sep 2024 00:24:52 +0000
Subject: [PATCH 3/7] style: reformat

---
 nmdc_runtime/site/ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 0bf337e2..8d215d2c 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1083,7 +1083,9 @@ def materialize_alldocs(context) -> int:
             representative_doc = next(docs_having_type, default=None)
 
             # Instantiate the Python class represented by the "representative" document.
-            db_dict = {collection_name: [dissoc(representative_doc, "_id")]}  # omits key incompatible with constructor
+            db_dict = {
+                collection_name: [dissoc(representative_doc, "_id")]
+            }  # omits key incompatible with constructor
             nmdc_db = NMDCDatabase(**db_dict)
             representative_instance = getattr(nmdc_db, collection_name)[0]

From 61fc98489d2308bdb16dcaab5d2c41f3ed81ea16 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:37:53 -0700
Subject: [PATCH 4/7] Fix bug in `log.info` statement

---
 nmdc_runtime/site/ops.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 8d215d2c..b09853dd 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1071,7 +1071,7 @@ def materialize_alldocs(context) -> int:
             num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
             docs_having_type = mdb[collection_name].find(filter=filter_)
             context.log.info(
-                f"Found {len(num_docs_having_type)} documents having {type_value=} in {collection_name=}."
+                f"Found {num_docs_having_type} documents having {type_value=} in {collection_name=}."
             )
 
             # Get a "representative" document from the result.
@@ -1084,8 +1084,9 @@ def materialize_alldocs(context) -> int:
 
             # Instantiate the Python class represented by the "representative" document.
             db_dict = {
+                # Shed the `_id` attribute, since the constructor doesn't allow it.
                 collection_name: [dissoc(representative_doc, "_id")]
-            }  # omits key incompatible with constructor
+            }
             nmdc_db = NMDCDatabase(**db_dict)
             representative_instance = getattr(nmdc_db, collection_name)[0]
 

From 11e7bed1934e35bb0da4e58b43d8f8ce163c4770 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:40:27 -0700
Subject: [PATCH 5/7] Omit invalid `kwarg` key (oops)

---
 nmdc_runtime/site/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index b09853dd..845f79d0 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1080,7 +1080,7 @@ def materialize_alldocs(context) -> int:
             # determining the class ancestry of only _one_ of them (we call this the "representative") and then
             # (later) attributing that class ancestry to all of them.
             #
-            representative_doc = next(docs_having_type, default=None)
+            representative_doc = next(docs_having_type)
 
             # Instantiate the Python class represented by the "representative" document.
             db_dict = {

From 06b002d3c78fbb8ab080fd62d0056eb8259ad763 Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 17:43:18 -0700
Subject: [PATCH 6/7] Clarify ambiguous log message

---
 nmdc_runtime/site/ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 845f79d0..78bfc962 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1105,8 +1105,8 @@ def materialize_alldocs(context) -> int:
                 ]
             )
             context.log.info(
-                f"Inserted {len(inserted_many_result.inserted_ids)} documents originally having {type_value=} "
-                f"in {collection_name=}."
+                f"Inserted {len(inserted_many_result.inserted_ids)} documents from {collection_name=} "
+                f"originally having {type_value=}."
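             )
 
     # Re-idx for `alldocs` collection

The two small fixes above come down to standard behavior: PyMongo's `count_documents` already returns an `int`, so wrapping it in `len()` raises a `TypeError`, and the built-in `next()` accepts its fallback only as a second positional argument, never as a `default=` keyword. In short:

    num_docs_having_type = mdb[collection_name].count_documents(filter_)  # an int, not a cursor
    representative_doc = next(docs_having_type, None)  # OK: positional fallback
    # next(docs_having_type, default=None)             # TypeError: next() takes no keyword arguments
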
From 6bb4524a8807f8026b797ae123c3f8a94a47344e Mon Sep 17 00:00:00 2001
From: eecavanna
Date: Thu, 19 Sep 2024 18:29:50 -0700
Subject: [PATCH 7/7] Add log message showing distinct "type" values in collection

---
 nmdc_runtime/site/ops.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 78bfc962..8b0d8faa 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -1059,6 +1059,9 @@ def materialize_alldocs(context) -> int:
         # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
         #
         distinct_type_values = mdb[collection_name].distinct(key="type")
+        context.log.info(
+            f"Found {len(distinct_type_values)} distinct `type` values in {collection_name=}: {distinct_type_values=}"
+        )
         for type_value in distinct_type_values:
 
             # Process all the documents in this collection that have this value in their `type` field.
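
With the series applied, every document in `alldocs` carries its full class ancestry in its `type` array, which is what makes queries that span subtypes cheap. A hypothetical query (the exact class-name spelling depends on what `class_hierarchy_as_list` emits for the schema version in use):

    # Matches documents of classes such as Extraction or Pooling, and any other descendant
    # of PlannedProcess, because each alldocs document's `type` array lists all of its ancestor classes.
    planned_process_docs = mdb.alldocs.find({"type": "PlannedProcess"})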