Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mapping for multiple environments #741

Merged
merged 8 commits into from
Dec 6, 2024
Merged
59 changes: 47 additions & 12 deletions nmdc_runtime/site/translation/submission_portal_translator.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,44 @@
import logging
import re
from datetime import datetime
from enum import Enum
from functools import lru_cache
from importlib import resources
from typing import Any, List, Optional, Union

from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SlotDefinition
from nmdc_schema import nmdc
from toolz import get_in, groupby, concat, valmap, dissoc
from toolz import concat, dissoc, get_in, groupby, valmap

from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator


BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"


class EnvironmentPackage(Enum):
    r"""
    Enumeration of all possible environmental packages.

    Each member's *value* is the string written to a Biosample's
    ``env_package`` slot. Each member's *name* is used as a lookup key:
    the translator upper-cases a ``sampleData`` key with its ``_data``
    suffix removed and indexes this enum with the result, so a name must
    match the corresponding key exactly or the lookup raises ``KeyError``.

    >>> EnvironmentPackage.AIR.value
    'air'
    >>> EnvironmentPackage.SEDIMENT.value
    'sediment'
    >>> EnvironmentPackage["HCR_FLUIDS_SWABS"].value
    'hydrocarbon resources-fluids_swabs'
    """

    AIR = "air"
    BIOFILM = "microbial mat_biofilm"
    BUILT_ENV = "built environment"
    HCR_CORES = "hydrocarbon resources-cores"
    # Canonical spelling: same "HCR" prefix as HCR_CORES and plural "FLUIDS"
    # as in the value. NOTE(review): presumably the portal key is
    # "hcr_fluids_swabs_data" -- confirm against the submission schema.
    HCR_FLUIDS_SWABS = "hydrocarbon resources-fluids_swabs"
    # Backward-compatible alias for the earlier misspelled member name.
    # Sharing the value makes it an alias, so it does not appear in iteration.
    HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
    HOST_ASSOCIATED = "host-associated"
    MISC_ENVS = "miscellaneous natural or artificial environment"
    PLANT_ASSOCIATED = "plant-associated"
    SEDIMENT = "sediment"
    SOIL = "soil"
    WATER = "water"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue here related to microbiomedata/submission-schema#186



@lru_cache
def _get_schema_view():
"""Return a SchemaView instance representing the NMDC schema"""
Expand Down Expand Up @@ -536,7 +559,6 @@ def _translate_biosample(
sample_data: List[JSON_OBJECT],
nmdc_biosample_id: str,
nmdc_study_id: str,
default_env_package: str,
) -> nmdc.Biosample:
"""Translate sample data from portal submission into an `nmdc:Biosample` object.

Expand All @@ -551,18 +573,23 @@ def _translate_biosample(
from each applicable submission portal tab
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
:param default_env_package: Default value for `env_package` slot
:return: nmdc:Biosample
"""
biosample_key = sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
env_idx = next(
(
i
for i, tab in enumerate(sample_data)
if tab.get("env_package") is not None
),
0,
)
biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
slots = {
"id": nmdc_biosample_id,
"associated_studies": [nmdc_study_id],
"type": "nmdc:Biosample",
"name": sample_data[0].get("samp_name", "").strip(),
"env_package": nmdc.TextValue(
has_raw_value=default_env_package, type="nmdc:TextValue"
),
"name": sample_data[env_idx].get("samp_name", "").strip(),
"env_package": sample_data[env_idx].get("env_package"),
}
for tab in sample_data:
transformed_tab = self._transform_dict_for_class(tab, "Biosample")
Expand Down Expand Up @@ -599,9 +626,18 @@ def get_database(self) -> nmdc.Database:
]

sample_data = metadata_submission_data.get("sampleData", {})
package_name = metadata_submission_data["packageName"]
for key in sample_data.keys():
env = key.removesuffix("_data").upper()
try:
package_name = EnvironmentPackage[env].value
for sample in sample_data[key]:
sample["env_package"] = package_name
except KeyError:
pass

sample_data_by_id = groupby(
BIOSAMPLE_UNIQUE_KEY_SLOT, concat(sample_data.values())
BIOSAMPLE_UNIQUE_KEY_SLOT,
concat(sample_data.values()),
)
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
sample_data_to_nmdc_biosample_ids = dict(
Expand All @@ -613,7 +649,6 @@ def get_database(self) -> nmdc.Database:
sample_data,
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
nmdc_study_id=nmdc_study_id,
default_env_package=package_name,
)
for sample_data_id, sample_data in sample_data_by_id.items()
if sample_data
Expand Down
102 changes: 97 additions & 5 deletions tests/test_data/test_submission_portal_translator_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
input:
metadata_submission:
metadata_submission:
packageName: plant-associated
packageName: ['plant-associated', 'air']
contextForm:
datasetDoi: doi:10.46936/10.25585/60000818
dataGenerated: true
Expand Down Expand Up @@ -31,8 +31,7 @@ input:
biosafetyLevel: ''
irbOrHipaa:
comments: ''
templates:
- plant-associated
templates: ['plant-associated', 'air', 'jgi_mg']
studyForm:
studyName: Seasonal activities of the phyllosphere microbiome of perennial crops
piName: Ashley Shade
Expand All @@ -57,7 +56,7 @@ input:
annotated. Our findings support that leaf-associated bacterial populations are
seasonally dynamic and responsive to host cues.
notes: ''
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
contributors:
Expand Down Expand Up @@ -281,6 +280,36 @@ input:
ecosystem_subtype: Leaf
ecosystem_category: Terrestrial
specific_ecosystem: Phyllosphere
air_data:
- elev: 225
lat_lon: 50.586825 6.408977
samp_name: "wedw"
env_medium: abcd [ENVO:00001998]
geo_loc_name: "USA: Maryland, Bethesda"
analysis_type:
- metaproteomics
collection_date: 2021
env_broad_scale: ewdwed [ENVO:123]
env_local_scale: asdxasd [ENV:234]
samp_store_temp: "-80 Celsius"
jgi_mg_data:
- samp_name: "G6R2_MAIN_09MAY2016"
analysis_type:
- metagenomics
dna_concentration: 0.1
dna_cont_type: tube
dna_container_id: 00001
dna_dnase: "no"
dna_isolate_meth: phenol/chloroform extraction
dna_project_contact: Jane Doe
dna_samp_id: 00001
dna_sample_format: Ethanol
dna_sample_name: 00001
dna_seq_project: 00001
dna_seq_project_name: 00001
dna_seq_project_pi: Jane Doe
dna_volume: 10
proposal_dna: 00001
status: in-progress
id: d32b5eb6-71e8-4c98-8a57-3e64d05718b4
author_orcid: 0000-0002-7705-343X
Expand Down Expand Up @@ -933,6 +962,69 @@ output:
has_raw_value: UUID:c0c4a2b5-0382-450a-8728-a176fa438efe
analysis_type:
- metagenomics
dna_concentration: 0.1
dna_cont_type: tube
dna_container_id: 00001
dna_dnase: "no"
dna_isolate_meth: phenol/chloroform extraction
dna_project_contact: Jane Doe
dna_samp_id: 00001
dna_sample_format: Ethanol
dna_sample_name: 00001
dna_seq_project: 00001
dna_seq_project_name: 00001
dna_seq_project_pi: Jane Doe
dna_volume: 10
proposal_dna: 00001
- id: nmdc:bsm-00-7r7pn0r2
type: 'nmdc:Biosample'
name: 'wedw'
associated_studies:
- 'nmdc:sty-00-y0cq65zt'
env_broad_scale:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'ewdwed [ENVO:123]'
term:
id: 'ENVO:123'
type: 'nmdc:OntologyClass'
name: 'ewdwed'
env_local_scale:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'asdxasd [ENV:234]'
term:
id: 'ENV:234'
type: 'nmdc:OntologyClass'
name: 'asdxasd'
env_medium:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'abcd [ENVO:00001998]'
term:
id: 'ENVO:00001998'
type: 'nmdc:OntologyClass'
name: 'abcd'
samp_name: 'wedw'
collection_date:
type: 'nmdc:TimestampValue'
has_raw_value: '2021'
elev: 225.0
env_package:
type: 'nmdc:TextValue'
has_raw_value: 'air'
geo_loc_name:
type: 'nmdc:TextValue'
has_raw_value: 'USA: Maryland, Bethesda'
lat_lon:
type: 'nmdc:GeolocationValue'
has_raw_value: '50.586825 6.408977'
latitude: 50.586825
longitude: 6.408977
samp_store_temp:
type: 'nmdc:QuantityValue'
has_raw_value: '-80 Celsius'
has_unit: 'Celsius'
has_numeric_value: -80.0
analysis_type:
- metaproteomics
study_set:
- id: nmdc:sty-00-y0cq65zt
type: nmdc:Study
Expand Down Expand Up @@ -1028,7 +1120,7 @@ input:
roles:
- Principal Investigator
description: This is a test submission
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
linkOutWebpage:
Expand Down
Loading