microbiomedata · sujaypatil96 · Feb 20, 2025 · Feb 14, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/nmdc_runtime/site/export/ncbi_xml.py b/nmdc_runtime/site/export/ncbi_xml.py
@@ -366,7 +366,12 @@ def set_fastq(
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument_used = ntseq.get("instrument_used", "")
+                            if not instrument_used:
+                                instrument_id = None
+                            else:
+                                instrument_id = instrument_used[0]
+
                             instrument = all_instruments.get(instrument_id, {})
                             instrument_vendor = instrument.get("vendor", "")
                             instrument_model = instrument.get("model", "")
@@ -448,6 +453,20 @@ def set_fastq(
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
                             )
                         )
+                    elif instrument_model == "novaseq_6000":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute",
+                                "NovaSeq 6000",
+                                {"name": "instrument_model"},
+                            )
+                        )
+                    elif instrument_model == "hiseq":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute", "HiSeq", {"name": "instrument_model"}
+                            )
+                        )
 
                 if analyte_category == "metagenome":
                     sra_attributes.append(

diff --git a/nmdc_runtime/site/export/ncbi_xml_utils.py b/nmdc_runtime/site/export/ncbi_xml_utils.py
@@ -1,5 +1,7 @@
 from io import BytesIO, StringIO
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
+from nmdc_schema.get_nmdc_view import ViewGetter
 from lxml import etree
 
 import csv
@@ -45,35 +47,41 @@ def get_instruments(instrument_set_collection):
         raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 
 
-def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_data_objects_from_biosamples(
+    all_docs_collection, data_object_set, biosamples_list
+):
+    biosample_data_objects = []
+
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
+
     biosample_data_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
 
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                query = {"has_input": current_id}
-                document = all_docs_collection.find_one(query)
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
 
-                if not document:
-                    continue
-
-                has_output = document.get("has_output")
-                if not has_output:
-                    continue
-
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                    else:
-                        new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
 
             current_ids = new_current_ids
 
@@ -83,7 +91,9 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
 
 
-def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection, data_generation_set, biosamples_list
+):
     biosample_data_objects = []
 
     for biosample in biosamples_list:
@@ -105,11 +115,13 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
 
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = all_docs_collection.find_one(
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                         if nucleotide_sequencing_doc:
-                            collected_data_objects.append(nucleotide_sequencing_doc)
+                            collected_data_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                     else:
                         new_current_ids.append(output_id)
 
@@ -121,7 +133,9 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
     return biosample_data_objects
 
 
-def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection, material_processing_set, biosamples_list
+):
     biosample_lib_prep = []
 
     for biosample in biosamples_list:
@@ -144,10 +158,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                 "has_input": output_id,
                 "type": {"$in": ["LibraryPreparation"]},
             }
-            lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
+            lib_prep_doc = material_processing_set.find_one(lib_prep_query)
 
             if lib_prep_doc:
-                biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                 break  # Stop at the first document that meets the criteria
 
     return biosample_lib_prep

diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py
@@ -1188,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
 
@@ -1200,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
 
@@ -1212,8 +1214,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep