From f3ba2569f54b2401659159eb6033b87cbe2245ed Mon Sep 17 00:00:00 2001
From: Donny Winston
Date: Wed, 7 Aug 2024 18:17:31 +0200
Subject: [PATCH 1/4] fix: allow "update" of non-`id`-having document
 collections

interpret as simple insertion. leave note in code about decision to insist
on schema-supplied uniqueness signal.

fix #611
---
 nmdc_runtime/api/db/mongo.py |  1 +
 nmdc_runtime/site/ops.py     | 19 +++++++++++++------
 nmdc_runtime/site/util.py    | 11 ++++-------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/nmdc_runtime/api/db/mongo.py b/nmdc_runtime/api/db/mongo.py
index 51809f54..23770710 100644
--- a/nmdc_runtime/api/db/mongo.py
+++ b/nmdc_runtime/api/db/mongo.py
@@ -62,6 +62,7 @@ def nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
     return {name for name in names if mdb[name].estimated_document_count() > 0}
 
 
+@lru_cache
 def get_collection_names_from_schema() -> list[str]:
     """
     Returns the names of the slots of the `Database` class that describe database collections.
diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 082ba4f9..2fdaee92 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -87,7 +87,7 @@
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.util import collection_indexed_on_id, run_and_log
+from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
 from nmdc_runtime.util import (
     drs_object_in_for,
     pluralize,
@@ -527,16 +527,23 @@ def perform_mongo_updates(context, json_in):
     if rv["result"] == "errors":
         raise Failure(str(rv["detail"]))
 
-    coll_has_id_index = collection_indexed_on_id(mongo.db)
-    if all(coll_has_id_index[coll] for coll in docs.keys()):
+    coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
+    if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
-    elif all(not coll_has_id_index[coll] for coll in docs.keys()):
+    elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
+        # XXX: This is a hack because e.g.
+        # documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
+        # and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
+        # as a deterministic hash of the compound key.
+        #
+        # For now, decision is to potentially re-insert "duplicate" documents, i.e. to interpret
+        # lack of `id` as lack of unique document identity for de-duplication.
         replace = False  # wasting time trying to upsert by `id`.
     else:
         colls_not_id_indexed = [
-            coll for coll in docs.keys() if not coll_has_id_index[coll]
+            coll for coll in docs.keys() if not coll_index_on_id_map[coll]
         ]
-        colls_id_indexed = [coll for coll in docs.keys() if coll_has_id_index[coll]]
+        colls_id_indexed = [coll for coll in docs.keys() if coll_index_on_id_map[coll]]
         raise Failure(
             "Simultaneous addition of non-`id`ed collections and `id`-ed collections"
             " is not supported at this time."
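
Note (not part of the patch): the XXX comment in the hunk above mentions auto-generating an `id` as a deterministic hash of the compound key as one possible way to make document identity explicit. A minimal sketch of that idea follows; the helper name, the `prefix`, and the digest truncation are illustrative assumptions, and the example values are taken from the `functional_annotation_agg` test added later in this series.

import hashlib

def deterministic_id(doc: dict, compound_key: tuple, prefix: str = "generated") -> str:
    # Join the compound-key values in a fixed order and hash them, so the same
    # (metagenome_annotation_id, gene_function_id) pair always yields the same `id`.
    raw = "|".join(str(doc[field]) for field in compound_key)
    return f"{prefix}:{hashlib.sha256(raw.encode('utf-8')).hexdigest()[:16]}"

# Repeated submissions of the same pair map to the same `id`, so an upsert by
# `id` (replace=True) could de-duplicate them instead of re-inserting.
doc = {
    "metagenome_annotation_id": "nmdc:wfmtan-13-hemh0a82.1",
    "gene_function_id": "KEGG.ORTHOLOGY:K00005",
    "count": 10,
}
print(deterministic_id(doc, ("metagenome_annotation_id", "gene_function_id")))
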
diff --git a/nmdc_runtime/site/util.py b/nmdc_runtime/site/util.py
index 614fcc31..0cd9b7ff 100644
--- a/nmdc_runtime/site/util.py
+++ b/nmdc_runtime/site/util.py
@@ -4,6 +4,7 @@
 
 from pymongo.database import Database as MongoDatabase
 
+from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
 
 mode_test = {
@@ -34,13 +35,9 @@ def run_and_log(shell_cmd, context):
 
 
 @lru_cache
-def collection_indexed_on_id(mdb: MongoDatabase) -> dict:
-    set_collection_names = [
-        name for name in mdb.list_collection_names() if name.endswith("_set")
-    ]
-    return {
-        name: ("id_1" in mdb[name].index_information()) for name in set_collection_names
-    }
+def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
+    names = set(mdb.list_collection_names()) & set(get_collection_names_from_schema())
+    return {name: ("id_1" in mdb[name].index_information()) for name in names}
 
 
 def get_basename(filename: str) -> str:

From 93f4cd69525cb8a058763ac7f9d062732438e5fc Mon Sep 17 00:00:00 2001
From: Donny Winston
Date: Wed, 7 Aug 2024 19:17:30 +0200
Subject: [PATCH 2/4] refactor to add test

---
 nmdc_runtime/site/ops.py         | 22 ++++++---
 nmdc_runtime/site/util.py        |  9 +++-
 tests/test_api/test_endpoints.py | 15 ++++++
 tests/test_ops/test_hello.py     |  8 ----
 tests/test_ops/test_ops.py       | 79 ++++++++++++++++++++++++++++++++
 5 files changed, 116 insertions(+), 17 deletions(-)
 delete mode 100644 tests/test_ops/test_hello.py
 create mode 100644 tests/test_ops/test_ops.py

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 2fdaee92..8345c754 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -75,6 +75,7 @@
     RuntimeApiSiteClient,
     RuntimeApiUserClient,
     NeonApiClient,
+    MongoDB as MongoDBResource,
 )
 from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
 from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
@@ -527,6 +528,19 @@ def perform_mongo_updates(context, json_in):
     if rv["result"] == "errors":
         raise Failure(str(rv["detail"]))
 
+    add_docs_result = _add_schema_docs_with_or_without_replacement(mongo, docs)
+    op_patch = UpdateOperationRequest(
+        done=True,
+        result=add_docs_result,
+        metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
+    )
+    op_doc = client.update_operation(op_id, op_patch).json()
+    return ["/operations/" + op_doc["id"]]
+
+
+def _add_schema_docs_with_or_without_replacement(
+    mongo: MongoDBResource, docs: Dict[str, list]
+):
     coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -550,13 +564,7 @@ def perform_mongo_updates(context, json_in):
             f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
         )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-    op_patch = UpdateOperationRequest(
-        done=True,
-        result=mongo_add_docs_result_as_dict(op_result),
-        metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
-    )
-    op_doc = client.update_operation(op_id, op_patch).json()
-    return ["/operations/" + op_doc["id"]]
+    return mongo_add_docs_result_as_dict(op_result)
 
 
 @op(required_resource_keys={"mongo"})
diff --git a/nmdc_runtime/site/util.py b/nmdc_runtime/site/util.py
index 0cd9b7ff..4280fe65 100644
--- a/nmdc_runtime/site/util.py
+++ b/nmdc_runtime/site/util.py
@@ -36,8 +36,13 @@ def run_and_log(shell_cmd, context):
 
 @lru_cache
 def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
-    names = set(mdb.list_collection_names()) & set(get_collection_names_from_schema())
-    return {name: ("id_1" in mdb[name].index_information()) for name in names}
+    present_collection_names = set(mdb.list_collection_names())
+    return {
+        name: (
+            name in present_collection_names and "id_1" in mdb[name].index_information()
+        )
+        for name in get_collection_names_from_schema()
+    }
 
 
 def get_basename(filename: str) -> str:
diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py
index 2f7dd486..552db772 100644
--- a/tests/test_api/test_endpoints.py
+++ b/tests/test_api/test_endpoints.py
@@ -194,6 +194,21 @@ def test_metadata_validate_json_with_unknown_collection(api_site_client):
     assert rv.json()["result"] == "errors"
 
 
+def test_metadata_submit_json_functional_annotation_agg(api_site_client):
+    rv = api_site_client.request(
+        "POST",
+        "/metadata/json:submit",
+        {
+            "field_research_site_set": [
+                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
+                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
+                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
+            ]
+        },
+    )
+    assert rv.json()["result"] == "errors"
+
+
 def test_submit_changesheet():
     sheet_in = ChangesheetIn(
         name="sheet",
diff --git a/tests/test_ops/test_hello.py b/tests/test_ops/test_hello.py
deleted file mode 100644
index 8afb925d..00000000
--- a/tests/test_ops/test_hello.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from dagster import build_op_context
-
-from nmdc_runtime.site.ops import hello
-
-
-def test_hello():
-    context = build_op_context()
-    assert hello(context) == "Hello, NMDC!"
diff --git a/tests/test_ops/test_ops.py b/tests/test_ops/test_ops.py
new file mode 100644
index 00000000..4c4fa04a
--- /dev/null
+++ b/tests/test_ops/test_ops.py
@@ -0,0 +1,79 @@
+import json
+import os
+
+import pytest
+from dagster import build_op_context
+
+from nmdc_runtime.api.endpoints.util import persist_content_and_get_drs_object
+from nmdc_runtime.site.resources import (
+    mongo_resource,
+    runtime_api_site_client_resource,
+    RuntimeApiSiteClient,
+    runtime_api_user_client_resource,
+    RuntimeApiUserClient,
+)
+
+from nmdc_runtime.site.ops import (
+    perform_mongo_updates,
+    _add_schema_docs_with_or_without_replacement,
+)
+
+
+@pytest.fixture
+def op_context():
+    return build_op_context(
+        resources={
+            "mongo": mongo_resource.configured(
+                {
+                    "dbname": os.getenv("MONGO_DBNAME"),
+                    "host": os.getenv("MONGO_HOST"),
+                    "password": os.getenv("MONGO_PASSWORD"),
+                    "username": os.getenv("MONGO_USERNAME"),
+                }
+            ),
+            "runtime_api_user_client": runtime_api_user_client_resource.configured(
+                {
+                    "base_url": os.getenv("API_HOST"),
+                    "username": os.getenv("API_ADMIN_USER"),
+                    "password": os.getenv("API_ADMIN_PASS"),
+                },
+            ),
+            "runtime_api_site_client": runtime_api_site_client_resource.configured(
+                {
+                    "base_url": os.getenv("API_HOST"),
+                    "site_id": os.getenv("API_SITE_ID"),
+                    "client_id": os.getenv("API_SITE_CLIENT_ID"),
+                    "client_secret": os.getenv("API_SITE_CLIENT_SECRET"),
+                }
+            ),
+        }
+    )
+
+
+def test_perform_mongo_updates_functional_annotation_agg(op_context):
+    mongo = op_context.resources.mongo
+    docs = {
+        "functional_annotation_agg": [
+            {
+                "metagenome_annotation_id": "nmdc:wfmtan-13-hemh0a82.1",
+                "gene_function_id": "KEGG.ORTHOLOGY:K00005",
+                "count": 10,
+            },
+            {
+                "metagenome_annotation_id": "nmdc:wfmtan-13-hemh0a82.1",
+                "gene_function_id": "KEGG.ORTHOLOGY:K01426",
+                "count": 5,
+            },
+        ]
+    }
+    # Ensure the docs are not already in the test database.
+    for doc_spec in docs["functional_annotation_agg"]:
+        mongo.db.functional_annotation_agg.delete_many(doc_spec)
+
+    _add_schema_docs_with_or_without_replacement(mongo, docs)
+    assert (
+        mongo.db.functional_annotation_agg.count_documents(
+            {"$or": docs["functional_annotation_agg"]}
+        )
+        == 2
+    )

From 4fb2a973e2b39ce17c4cbb67370af3fb223a9b42 Mon Sep 17 00:00:00 2001
From: Donny Winston
Date: Wed, 7 Aug 2024 19:36:14 +0200
Subject: [PATCH 3/4] fix: rm abandoned candidate test

---
 tests/test_api/test_endpoints.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tests/test_api/test_endpoints.py b/tests/test_api/test_endpoints.py
index 552db772..2f7dd486 100644
--- a/tests/test_api/test_endpoints.py
+++ b/tests/test_api/test_endpoints.py
@@ -194,21 +194,6 @@ def test_metadata_validate_json_with_unknown_collection(api_site_client):
     assert rv.json()["result"] == "errors"
 
 
-def test_metadata_submit_json_functional_annotation_agg(api_site_client):
-    rv = api_site_client.request(
-        "POST",
-        "/metadata/json:submit",
-        {
-            "field_research_site_set": [
-                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
-                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
-                {"id": "nmdc:frsite-11-s2dqk408", "name": "BESC-470-CL2_38_23"},
-            ]
-        },
-    )
-    assert rv.json()["result"] == "errors"
-
-
 def test_submit_changesheet():
     sheet_in = ChangesheetIn(
         name="sheet",

From b8968838ae8bdabfed75771a7c200e9954587af6 Mon Sep 17 00:00:00 2001
From: Donny Winston
Date: Thu, 8 Aug 2024 10:18:32 +0200
Subject: [PATCH 4/4] Update nmdc_runtime/site/ops.py

Co-authored-by: eecavanna <134325062+eecavanna@users.noreply.github.com>
---
 nmdc_runtime/site/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
index 8345c754..e96554db 100644
--- a/nmdc_runtime/site/ops.py
+++ b/nmdc_runtime/site/ops.py
@@ -545,7 +545,7 @@ def _add_schema_docs_with_or_without_replacement(
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
     elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
-        # XXX: This is a hack because e.g.
+        # FIXME: XXX: This is a hack because e.g.
         # documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
         # and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
         # as a deterministic hash of the compound key.
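
Note (not part of the patch series): patch 2's rewrite of `schema_collection_has_index_on_id` changes behavior, not just naming. The patch-1 version reports only on collections that already exist in the database, so a schema collection that has not been created yet (e.g. `functional_annotation_agg` in a fresh test database) gets no entry at all, and the lookup in `_add_schema_docs_with_or_without_replacement` would then fail; the patch-2 version gives every schema collection an entry, with absent collections mapping to False. The sketch below shows the difference using stub objects in place of a live pymongo database; the stub classes and the two-name schema list are illustrative assumptions, not code from the repository.

class StubCollection:
    def __init__(self, has_id_index):
        self.has_id_index = has_id_index

    def index_information(self):
        # Mimic pymongo: a collection with an index on `id` reports an "id_1" key.
        return {"id_1": {}} if self.has_id_index else {"_id_": {}}


class StubDatabase:
    def __init__(self, collections):
        self.collections = collections

    def list_collection_names(self):
        return list(self.collections)

    def __getitem__(self, name):
        return self.collections[name]


schema_names = ["study_set", "functional_annotation_agg"]  # illustrative subset
mdb = StubDatabase({"study_set": StubCollection(True)})  # agg collection not created yet

# Patch-1 behavior: intersect with existing collections; absent ones get no entry.
v1 = {
    name: "id_1" in mdb[name].index_information()
    for name in set(mdb.list_collection_names()) & set(schema_names)
}
assert v1 == {"study_set": True}  # no "functional_annotation_agg" key, so a later lookup fails

# Patch-2 behavior: every schema collection gets an entry; absent ones map to False.
present_collection_names = set(mdb.list_collection_names())
v2 = {
    name: (name in present_collection_names and "id_1" in mdb[name].index_information())
    for name in schema_names
}
assert v2 == {"study_set": True, "functional_annotation_agg": False}
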