Skip to content

Commit

Permalink
Merge pull request #741 from microbiomedata/handle-multiple-envs
Browse files Browse the repository at this point in the history
Add mapping for multiple environments
  • Loading branch information
pkalita-lbl authored Dec 6, 2024
2 parents 17a8dd8 + 266a043 commit bfeee05
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 17 deletions.
59 changes: 47 additions & 12 deletions nmdc_runtime/site/translation/submission_portal_translator.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,44 @@
import logging
import re
from datetime import datetime
from enum import Enum
from functools import lru_cache
from importlib import resources
from typing import Any, List, Optional, Union

from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SlotDefinition
from nmdc_schema import nmdc
from toolz import get_in, groupby, concat, valmap, dissoc
from toolz import concat, dissoc, get_in, groupby, valmap

from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator


BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"


class EnvironmentPackage(Enum):
    r"""
    Enumeration of all possible environmental packages.

    Each member's *value* is the text stored in a Biosample's ``env_package``
    slot. Each member's *name* matters too: this module resolves members by
    name (``EnvironmentPackage[<NAME>]``) using a sample-data key with its
    ``_data`` suffix removed and upper-cased, so a misspelled name means the
    lookup raises ``KeyError`` and no ``env_package`` is assigned.

    >>> EnvironmentPackage.AIR.value
    'air'
    >>> EnvironmentPackage.SEDIMENT.value
    'sediment'
    >>> EnvironmentPackage["HCR_FLUIDS_SWABS"].value
    'hydrocarbon resources-fluids_swabs'
    """

    AIR = "air"
    BIOFILM = "microbial mat_biofilm"
    BUILT_ENV = "built environment"
    HCR_CORES = "hydrocarbon resources-cores"
    # Legacy spelling ("HRC", singular "FLUID"); kept as the canonical member
    # so existing references and ``.name`` values do not change.
    HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
    # Alias (same value => same member) spelled consistently with HCR_CORES
    # and with the "fluids_swabs" value, so a name-based lookup of
    # "HCR_FLUIDS_SWABS" resolves instead of raising KeyError.
    # NOTE(review): presumably the portal key is "hcr_fluids_swabs_data" --
    # confirm against the submission portal's sample-data slot names.
    HCR_FLUIDS_SWABS = "hydrocarbon resources-fluids_swabs"
    HOST_ASSOCIATED = "host-associated"
    MISC_ENVS = "miscellaneous natural or artificial environment"
    PLANT_ASSOCIATED = "plant-associated"
    SEDIMENT = "sediment"
    SOIL = "soil"
    WATER = "water"


@lru_cache
def _get_schema_view():
"""Return a SchemaView instance representing the NMDC schema"""
Expand Down Expand Up @@ -550,7 +573,6 @@ def _translate_biosample(
sample_data: List[JSON_OBJECT],
nmdc_biosample_id: str,
nmdc_study_id: str,
default_env_package: str,
) -> nmdc.Biosample:
"""Translate sample data from portal submission into an `nmdc:Biosample` object.
Expand All @@ -565,18 +587,23 @@ def _translate_biosample(
from each applicable submission portal tab
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
:param default_env_package: Default value for `env_package` slot
:return: nmdc:Biosample
"""
biosample_key = sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
env_idx = next(
(
i
for i, tab in enumerate(sample_data)
if tab.get("env_package") is not None
),
0,
)
biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
slots = {
"id": nmdc_biosample_id,
"associated_studies": [nmdc_study_id],
"type": "nmdc:Biosample",
"name": sample_data[0].get("samp_name", "").strip(),
"env_package": nmdc.TextValue(
has_raw_value=default_env_package, type="nmdc:TextValue"
),
"name": sample_data[env_idx].get("samp_name", "").strip(),
"env_package": sample_data[env_idx].get("env_package"),
}
for tab in sample_data:
transformed_tab = self._transform_dict_for_class(tab, "Biosample")
Expand Down Expand Up @@ -613,9 +640,18 @@ def get_database(self) -> nmdc.Database:
]

sample_data = metadata_submission_data.get("sampleData", {})
package_name = metadata_submission_data["packageName"]
for key in sample_data.keys():
env = key.removesuffix("_data").upper()
try:
package_name = EnvironmentPackage[env].value
for sample in sample_data[key]:
sample["env_package"] = package_name
except KeyError:
pass

sample_data_by_id = groupby(
BIOSAMPLE_UNIQUE_KEY_SLOT, concat(sample_data.values())
BIOSAMPLE_UNIQUE_KEY_SLOT,
concat(sample_data.values()),
)
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
sample_data_to_nmdc_biosample_ids = dict(
Expand All @@ -627,7 +663,6 @@ def get_database(self) -> nmdc.Database:
sample_data,
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
nmdc_study_id=nmdc_study_id,
default_env_package=package_name,
)
for sample_data_id, sample_data in sample_data_by_id.items()
if sample_data
Expand Down
102 changes: 97 additions & 5 deletions tests/test_data/test_submission_portal_translator_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
input:
metadata_submission:
metadata_submission:
packageName: plant-associated
packageName: ['plant-associated', 'air']
contextForm:
datasetDoi: doi:10.46936/10.25585/60000818
dataGenerated: true
Expand Down Expand Up @@ -31,8 +31,7 @@ input:
biosafetyLevel: ''
irbOrHipaa:
comments: ''
templates:
- plant-associated
templates: ['plant-associated', 'air', 'jgi_mg']
studyForm:
studyName: Seasonal activities of the phyllosphere microbiome of perennial crops
piName: Ashley Shade
Expand All @@ -57,7 +56,7 @@ input:
annotated. Our findings support that leaf-associated bacterial populations are
seasonally dynamic and responsive to host cues.
notes: ''
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
contributors:
Expand Down Expand Up @@ -281,6 +280,36 @@ input:
ecosystem_subtype: Leaf
ecosystem_category: Terrestrial
specific_ecosystem: Phyllosphere
air_data:
- elev: 225
lat_lon: 50.586825 6.408977
samp_name: "wedw"
env_medium: abcd [ENVO:00001998]
geo_loc_name: "USA: Maryland, Bethesda"
analysis_type:
- metaproteomics
collection_date: 2021
env_broad_scale: ewdwed [ENVO:123]
env_local_scale: asdxasd [ENV:234]
samp_store_temp: "-80 Celsius"
jgi_mg_data:
- samp_name: "G6R2_MAIN_09MAY2016"
analysis_type:
- metagenomics
dna_concentration: 0.1
dna_cont_type: tube
dna_container_id: 00001
dna_dnase: "no"
dna_isolate_meth: phenol/chloroform extraction
dna_project_contact: Jane Doe
dna_samp_id: 00001
dna_sample_format: Ethanol
dna_sample_name: 00001
dna_seq_project: 00001
dna_seq_project_name: 00001
dna_seq_project_pi: Jane Doe
dna_volume: 10
proposal_dna: 00001
status: in-progress
id: d32b5eb6-71e8-4c98-8a57-3e64d05718b4
author_orcid: 0000-0002-7705-343X
Expand Down Expand Up @@ -933,6 +962,69 @@ output:
has_raw_value: UUID:c0c4a2b5-0382-450a-8728-a176fa438efe
analysis_type:
- metagenomics
dna_concentration: 0.1
dna_cont_type: tube
dna_container_id: 00001
dna_dnase: "no"
dna_isolate_meth: phenol/chloroform extraction
dna_project_contact: Jane Doe
dna_samp_id: 00001
dna_sample_format: Ethanol
dna_sample_name: 00001
dna_seq_project: 00001
dna_seq_project_name: 00001
dna_seq_project_pi: Jane Doe
dna_volume: 10
proposal_dna: 00001
- id: nmdc:bsm-00-7r7pn0r2
type: 'nmdc:Biosample'
name: 'wedw'
associated_studies:
- 'nmdc:sty-00-y0cq65zt'
env_broad_scale:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'ewdwed [ENVO:123]'
term:
id: 'ENVO:123'
type: 'nmdc:OntologyClass'
name: 'ewdwed'
env_local_scale:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'asdxasd [ENV:234]'
term:
id: 'ENV:234'
type: 'nmdc:OntologyClass'
name: 'asdxasd'
env_medium:
type: 'nmdc:ControlledIdentifiedTermValue'
has_raw_value: 'abcd [ENVO:00001998]'
term:
id: 'ENVO:00001998'
type: 'nmdc:OntologyClass'
name: 'abcd'
samp_name: 'wedw'
collection_date:
type: 'nmdc:TimestampValue'
has_raw_value: '2021'
elev: 225.0
env_package:
type: 'nmdc:TextValue'
has_raw_value: 'air'
geo_loc_name:
type: 'nmdc:TextValue'
has_raw_value: 'USA: Maryland, Bethesda'
lat_lon:
type: 'nmdc:GeolocationValue'
has_raw_value: '50.586825 6.408977'
latitude: 50.586825
longitude: 6.408977
samp_store_temp:
type: 'nmdc:QuantityValue'
has_raw_value: '-80 Celsius'
has_unit: 'Celsius'
has_numeric_value: -80.0
analysis_type:
- metaproteomics
study_set:
- id: nmdc:sty-00-y0cq65zt
type: nmdc:Study
Expand Down Expand Up @@ -1032,7 +1124,7 @@ input:
roles:
- Principal Investigator
description: This is a test submission
fundingSources:
fundingSources:
- Some award ABC
- Some award XYZ
linkOutWebpage:
Expand Down

0 comments on commit bfeee05

Please sign in to comment.