Skip to content

Commit

Permalink
Add mapping for multiple environments
Browse files Browse the repository at this point in the history
  • Loading branch information
marySalvi committed Oct 29, 2024
1 parent 9ec7abb commit f2c0593
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 23 deletions.
68 changes: 48 additions & 20 deletions nmdc_runtime/site/translation/submission_portal_translator.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,44 @@
import logging
import re
from datetime import datetime
from enum import Enum
from functools import lru_cache
from importlib import resources
from typing import Any, List, Optional, Union

from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SlotDefinition
from nmdc_schema import nmdc
from toolz import get_in, groupby, concat, valmap, dissoc
from toolz import concat, dissoc, get_in, groupby, valmap

from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator


# Name of the sample-data slot used as the grouping key for submitted rows:
# rows sharing the same value in this slot are grouped together before being
# translated into a biosample.
BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"


class EnvironmentType(Enum):
    r"""
    Enumeration of all possible environment types.

    Each member's *value* is the environment package name string; the member
    *name* is a short upper-case identifier. Look a member up by name with
    ``EnvironmentType["AIR"]`` and by value with ``EnvironmentType("air")``.

    >>> EnvironmentType.AIR.value
    'air'
    >>> EnvironmentType.SEDIMENT.value
    'sediment'
    >>> EnvironmentType["HCR_CORES"].value
    'hydrocarbon resources-cores'
    >>> EnvironmentType("soil") is EnvironmentType.SOIL
    True
    """

    AIR = "air"
    BIOFILM = "microbial mat_biofilm"
    BUILT_ENV = "built environment"
    # Spelling fixed: the value previously read "hydrocardbon resources-cores".
    HCR_CORES = "hydrocarbon resources-cores"
    # NOTE(review): this member's prefix is spelled "HRC" while the one above
    # uses "HCR"; the name is kept as-is because callers may reference it.
    HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
    HOST_ASSOCIATED = "host-associated"
    MISC_ENVS = "miscellaneous natural or artificial environment"
    PLANT_ASSOCIATED = "plant-associated"
    SEDIMENT = "sediment"
    SOIL = "soil"
    WATER = "water"


@lru_cache
def _get_schema_view():
"""Return a SchemaView instance representing the NMDC schema"""
Expand Down Expand Up @@ -599,25 +622,30 @@ def get_database(self) -> nmdc.Database:
]

sample_data = metadata_submission_data.get("sampleData", {})
package_name = metadata_submission_data["packageName"]
sample_data_by_id = groupby(
BIOSAMPLE_UNIQUE_KEY_SLOT, concat(sample_data.values())
)
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
sample_data_to_nmdc_biosample_ids = dict(
zip(sample_data_by_id.keys(), nmdc_biosample_ids)
)

database.biosample_set = [
self._translate_biosample(
sample_data,
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
nmdc_study_id=nmdc_study_id,
default_env_package=package_name,
for key in sample_data.keys():
env = key.rsplit("_", 1)[0].upper()
package_name = EnvironmentType(env).value
sample_data_by_id = groupby(
lambda sample: (sample[BIOSAMPLE_UNIQUE_KEY_SLOT], package_name),
concat(sample_data.values()),
)
for sample_data_id, sample_data in sample_data_by_id.items()
if sample_data
]
nmdc_biosample_ids = self._id_minter(
"nmdc:Biosample", len(sample_data_by_id)
)
sample_data_to_nmdc_biosample_ids = dict(
zip(sample_data_by_id.keys(), nmdc_biosample_ids)
)

database.biosample_set = [
self._translate_biosample(
sample_data,
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
nmdc_study_id=nmdc_study_id,
default_env_package=package_name,
)
for sample_data_id, sample_data in sample_data_by_id.items()
if sample_data
]

if self.nucleotide_sequencing_mapping:
# If there is data from an NucleotideSequencing mapping file, process it now. This part
Expand Down
14 changes: 11 additions & 3 deletions tests/test_data/test_submission_portal_translator_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
input:
metadata_submission:
metadata_submission:
packageName: plant-associated
packageName: ['plant-associated', 'air']
contextForm:
datasetDoi: doi:10.46936/10.25585/60000818
dataGenerated: true
Expand Down Expand Up @@ -57,7 +57,7 @@ input:
annotated. Our findings support that leaf-associated bacterial populations are
seasonally dynamic and responsive to host cues.
notes: ''
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
contributors:
Expand Down Expand Up @@ -281,6 +281,14 @@ input:
ecosystem_subtype: Leaf
ecosystem_category: Terrestrial
specific_ecosystem: Phyllosphere
air_data:
- samp_name: "4"
analysis_type: metagenomics_long_read
- samp_name: "5"
- samp_name: "6"
jgi_mg_lr_data:
- samp_name: "4",
- analysis_type: metagenomics_long_read
status: in-progress
id: d32b5eb6-71e8-4c98-8a57-3e64d05718b4
author_orcid: 0000-0002-7705-343X
Expand Down Expand Up @@ -1028,7 +1036,7 @@ input:
roles:
- Principal Investigator
description: This is a test submission
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
linkOutWebpage:
Expand Down

0 comments on commit f2c0593

Please sign in to comment.