-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch '99-enable-reference-spectra-parsing-from-sdf-files' into 'master'
Resolve "Enable reference spectra parsing from .sdf files". Closes #99. See merge request bright-giant/sirius/sirius-libs!72
- Loading branch information
Showing
11 changed files
with
492 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
133 changes: 133 additions & 0 deletions
133
io/src/main/java/de/unijena/bioinf/babelms/sdf/SdfExperimentParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
package de.unijena.bioinf.babelms.sdf; | ||
|
||
import de.unijena.bioinf.ChemistryBase.chem.InChI;
import de.unijena.bioinf.ChemistryBase.ms.Ms2Experiment;
import de.unijena.bioinf.ChemistryBase.ms.MutableMs2Experiment;
import de.unijena.bioinf.ChemistryBase.ms.SpectrumFileSource;
import de.unijena.bioinf.ChemistryBase.ms.utils.SimpleSpectrum;
import de.unijena.bioinf.ChemistryBase.utils.FileUtils;
import de.unijena.bioinf.babelms.Parser;
import de.unijena.bioinf.babelms.intermediate.ExperimentData;
import de.unijena.bioinf.babelms.intermediate.ExperimentDataParser;
import io.github.dan2097.jnainchi.InchiFlag;
import io.github.dan2097.jnainchi.InchiStatus;
import lombok.extern.slf4j.Slf4j;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.inchi.InChIGenerator;
import org.openscience.cdk.inchi.InChIGeneratorFactory;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.silent.SilentChemObjectBuilder;
import org.openscience.cdk.smiles.SmilesGenerator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import static de.unijena.bioinf.ChemistryBase.chem.InChIs.isStandardInchi;
import static de.unijena.bioinf.ChemistryBase.chem.InChIs.newInChI;
|
||
@Slf4j | ||
public class SdfExperimentParser implements Parser<Ms2Experiment> { | ||
|
||
InputStream lastSeenInputStream = null; | ||
BufferedReader lastWrappingReader = null; | ||
|
||
@Override | ||
public Ms2Experiment parse(BufferedReader reader, URI source) throws IOException { | ||
IteratingSDFReader sdfReader = new IteratingSDFReader(reader, SilentChemObjectBuilder.getInstance()); | ||
if (sdfReader.hasNext()) { | ||
IAtomContainer sdfData = sdfReader.next(); | ||
ExperimentData data = extractData(sdfData); | ||
MutableMs2Experiment experiment = new ExperimentDataParser().parse(data); | ||
experiment.setAnnotation(SpectrumFileSource.class, new SpectrumFileSource(source)); | ||
return experiment; | ||
} | ||
return null; | ||
} | ||
|
||
private ExperimentData extractData(IAtomContainer sdfData) { | ||
ExperimentData data = ExperimentData.builder() | ||
.id(sdfData.getProperty("id") != null ? sdfData.getProperty("id") : sdfData.getProperty("name")) | ||
.spectrum(parseSpectrum(sdfData.getProperty("mass spectral peaks"))) | ||
.spectrumLevel(sdfData.getProperty("spectrum type")) | ||
.splash(sdfData.getProperty("splash")) | ||
.precursorMz(sdfData.getProperty("precursor m/z")) | ||
.precursorIonType(sdfData.getProperty("precursor type")) | ||
.instrumentation(sdfData.getProperty("instrument") + " " + sdfData.getProperty("instrument type")) | ||
.collisionEnergy(sdfData.getProperty("collision energy")) | ||
.compoundName(sdfData.getProperty("name")) | ||
.molecularFormula(sdfData.getProperty("formula")) | ||
.build(); | ||
|
||
fillInchi(data, sdfData); | ||
fillSmiles(data, sdfData); | ||
return data; | ||
} | ||
|
||
private SimpleSpectrum parseSpectrum(String peaksStr) { | ||
if (peaksStr == null) { | ||
return null; | ||
} | ||
String[] peaks = peaksStr.split("\n"); | ||
double[] masses = new double[peaks.length]; | ||
double[] intensities = new double[peaks.length]; | ||
for (int i = 0; i < peaks.length; i++) { | ||
String[] pair = peaks[i].split(" "); | ||
masses[i] = Double.parseDouble(pair[0]); | ||
intensities[i] = Double.parseDouble(pair[1]); | ||
} | ||
return new SimpleSpectrum(masses, intensities); | ||
} | ||
|
||
private void fillInchi(ExperimentData data, IAtomContainer sdfData) { | ||
try { | ||
InChI inchi = getInchi(sdfData); | ||
if (inchi != null) { | ||
data.setInchi(inchi.in3D); | ||
data.setInchiKey(inchi.key); | ||
} | ||
} catch (CDKException e) { | ||
log.warn("Could not create InChI from sdf data", e); | ||
} | ||
} | ||
|
||
/** | ||
* Temporary copy-paste from InChISMILESUtils. Cannot use it directly because of a circular dependency, todo issue #114 | ||
*/ | ||
public static InChI getInchi(IAtomContainer atomContainer) throws CDKException { | ||
// this will create a standard inchi, see: https://egonw.github.io/cdkbook/inchi.html | ||
InChIGenerator inChIGenerator = InChIGeneratorFactory.getInstance().getInChIGenerator(atomContainer, InchiFlag.SNon); //removing stereoInformation produces much less warnings, including 'Omitted undefined stereo' | ||
InchiStatus state = inChIGenerator.getStatus(); | ||
if (state != InchiStatus.ERROR) { | ||
if (state == InchiStatus.WARNING) | ||
log.debug("Warning while reading AtomContainer with title '" + atomContainer.getTitle() + "' -> " + inChIGenerator.getMessage()); | ||
String inchi = inChIGenerator.getInchi(); | ||
if (inchi == null) return null; | ||
if (!isStandardInchi(inchi)) | ||
throw new IllegalStateException("Non standard Inchi was created ('" + inchi + "'), which is not expected behaviour. Please submit a bug report!"); | ||
String key = inChIGenerator.getInchiKey(); | ||
return newInChI(key, inchi); | ||
} else { | ||
throw new CDKException("Error while creating InChI. State: '" + state + "'. Message: '" + inChIGenerator.getMessage() + "'."); | ||
} | ||
} | ||
|
||
private void fillSmiles(ExperimentData data, IAtomContainer sdfData) { | ||
try { | ||
data.setSmiles(SmilesGenerator.unique().create(sdfData)); | ||
} catch (CDKException e) { | ||
log.warn("Could not create smiles from sdf data", e); | ||
} | ||
} | ||
|
||
@Override | ||
public Ms2Experiment parse(InputStream inputStream, URI source) throws IOException { | ||
if (inputStream != lastSeenInputStream) { | ||
lastSeenInputStream = inputStream; | ||
lastWrappingReader = FileUtils.ensureBuffering(new InputStreamReader(inputStream)); | ||
} | ||
return parse(lastWrappingReader, source); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
io/src/test/java/de/unijena/bioinf/babelms/sdf/SdfExperimentParserTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package de.unijena.bioinf.babelms.sdf; | ||
|
||
import de.unijena.bioinf.ChemistryBase.chem.InChI; | ||
import de.unijena.bioinf.ChemistryBase.chem.Smiles; | ||
import de.unijena.bioinf.ChemistryBase.ms.*; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import java.io.IOException; | ||
|
||
import static de.unijena.bioinf.babelms.ParserTestUtils.loadExperiment; | ||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
class SdfExperimentParserTest { | ||
|
||
@Test | ||
void parseTest() throws IOException { | ||
Ms2Experiment experiment = loadExperiment("sdf/UP000538.sdf"); | ||
|
||
assertTrue(experiment.getMs1Spectra().isEmpty()); | ||
assertEquals(1, experiment.getMs2Spectra().size()); | ||
|
||
Ms2Spectrum<Peak> spectrum = experiment.getMs2Spectra().get(0); | ||
assertEquals(1, spectrum.size()); | ||
assertEquals(349.1844, spectrum.getMzAt(0), 1e-9); | ||
assertEquals(100, spectrum.getIntensityAt(0), 1e-9); | ||
assertEquals(393.2095, spectrum.getPrecursorMz(), 1e-9); | ||
|
||
assertEquals("Octaethylene glycol", experiment.getName()); | ||
assertEquals("C16H34O9", experiment.getMolecularFormula().toString()); | ||
|
||
assertEquals(MsInstrumentation.Instrument.ORBI, experiment.getAnnotation(MsInstrumentation.class).orElseThrow()); | ||
|
||
assertEquals("InChI=1S/C16H34O9/c17-1-3-19-5-7-21-9-11-23-13-15-25-16-14-24-12-10-22-8-6-20-4-2-18/h17-18H,1-16H2", experiment.getAnnotation(InChI.class).orElseThrow().in3D); | ||
assertEquals("GLZWNFNQMJAZGY-UHFFFAOYSA-N", experiment.getAnnotation(InChI.class).orElseThrow().key); | ||
|
||
assertEquals("OCCOCCOCCOCCOCCOCCOCCOCCO", experiment.getAnnotation(Smiles.class).orElseThrow().smiles); | ||
assertEquals("splash10-0002-0009000000-d849df50ad373f4c881c", experiment.getAnnotation(Splash.class).orElseThrow().getSplash()); | ||
|
||
assertEquals("[M + Na]+", experiment.getPrecursorIonType().toString()); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
ACCESSION: MSBNK-HBM4EU-HB002845 | ||
RECORD_TITLE: Chlorotoluron-OH-desmethyl (TENTATIVE); LC-ESI-QFT; MS2; CE: 30%; R=70000; [M+H]+ | ||
DATE: 2021.02.23 | ||
AUTHORS: Carolin Huber, Tobias Schulze, Martin Krauss, Department of Effect-Directed Analysis, Helmholtz Centre for Environmental Research - UFZ GmbH, Leipzig, Germany | ||
LICENSE: CC0 | ||
COPYRIGHT: Copyright (C) 2021 Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany | ||
PUBLICATION: Huber C, Mueller E, Schulze T, Brack W, Krauss M, Improving the Screening Analysis of Pesticide Metabolites in Human Biomonitoring by Combining High-Throughput In Vitro Incubation and Automated LC–HRMS Data Processing, Analytical Chemistry, https://doi.org/10.1021/acs.analchem.1c00972 | ||
COMMENT: HBM4EU - science and policy for a healthy future (https://www.hbm4eu.eu) | ||
COMMENT: COMMENT: CONFIDENCE: Tentative structure, with evidences on substitutes (Level 3b) | ||
COMMENT: COMMENT: generated by human liver S9 incubation | ||
COMMENT: Chlorotoluron_OH-desmethyl_30eV.txt | ||
CH$NAME: Chlorotoluron-OH-desmethyl | ||
CH$COMPOUND_CLASS: N/A; Biotransformation Product | ||
CH$FORMULA: C9H11ClN2O2 | ||
CH$EXACT_MASS: 214.05 | ||
CH$SMILES: C(*)C1=C(c(*)=C(C(*)=C(*)1)NC(=O)N(C(*)))Cl * = [OH (n=1) & H (n=4)] | ||
CH$IUPAC: N/A | ||
AC$INSTRUMENT: Q Exactive Plus Orbitrap Thermo Scientific | ||
AC$INSTRUMENT_TYPE: LC-ESI-QFT | ||
AC$MASS_SPECTROMETRY: MS_TYPE MS2 | ||
AC$MASS_SPECTROMETRY: ION_MODE POSITIVE | ||
AC$MASS_SPECTROMETRY: IONIZATION ESI | ||
AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE HCD | ||
AC$MASS_SPECTROMETRY: COLLISION_ENERGY 30% (nominal) | ||
AC$MASS_SPECTROMETRY: RESOLUTION 70000 | ||
AC$CHROMATOGRAPHY: COLUMN_NAME Waters UPLC BEH C18 1.7 um 2.1 mm x 100 mm with pre-column | ||
AC$CHROMATOGRAPHY: FLOW_GRADIENT 100/0 at 0 min, 0/100 at 15 min, 0/100 at 21 min, 100/0 at 22 min, 100/0 at 30 min | ||
AC$CHROMATOGRAPHY: FLOW_RATE 300 uL/min | ||
AC$CHROMATOGRAPHY: RETENTION_TIME 8.54 min | ||
AC$CHROMATOGRAPHY: SOLVENT A water with 1% ammonium carbonate 1M | ||
AC$CHROMATOGRAPHY: SOLVENT B methanol with 1% ammonium carbonate 1M and 5% water | ||
MS$FOCUSED_ION: PRECURSOR_M/Z 215.0578 | ||
MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+ | ||
MS$DATA_PROCESSING: COMMENT Peaks removed that cannot be explained by GenForm and Molecular Formula (5 ppm) | ||
MS$DATA_PROCESSING: WHOLE mzR | ||
PK$SPLASH: splash10-0006-9600000000-16611426f75bfbd952c5 | ||
PK$NUM_PEAK: 20 | ||
PK$PEAK: m/z int. rel.int. | ||
58.029 601491.8 68 | ||
65.0386 21341.5 2 | ||
66.0466 24528.2 3 | ||
77.0385 18312 2 | ||
85.0284 16591 2 | ||
92.0494 153550.8 17 | ||
93.0573 8843257 999 | ||
98.9996 22119.4 2 | ||
104.0494 29876.4 3 | ||
105.0574 70014.1 8 | ||
110.0602 21156.3 2 | ||
113.0152 38686.4 4 | ||
122.9997 160015.2 18 | ||
126.0105 60570.7 7 | ||
128.0262 4926958.7 557 | ||
129.0101 32821.9 4 | ||
139.0059 39133.8 4 | ||
140.0262 1401638.5 158 | ||
154.0056 55904.7 6 | ||
158.0368 658013.5 74 | ||
// | ||
|
||
ACCESSION: MSBNK-HBM4EU-HB001222 | ||
RECORD_TITLE: Metoclopramide; LC-ESI-ITFT; MS2; CE: 55%; R=15000; [M+H]+ | ||
DATE: 2018.09.08 | ||
AUTHORS: Tobias Schulze, Carolin Huber, Martin Krauss, Department of Effect-Directed Analysis, Helmholtz Centre for Environmental Research GmbH - UFZ, Leipzig, Germany | ||
LICENSE: CC0 | ||
COPYRIGHT: Copyright (C) 2018 | ||
PUBLICATION: Oberacher H, Sasse M, Antignac J-P, Guitton Y, Debrauwer L, Jamin E L, Schulze T, Krauss M, Covaci A, Caballero-Casero N, Rosseau K, Damont A, Fenaille F, Lamoree M, Schymanski E, A European proposal for quality control and quality assurance of tandem mass spectral libraries, Environmental Sciences Europe, https://doi.org/10.1186/s12302-020-00314-9 | ||
COMMENT: CONFIDENCE Reference Standard (Level 1) | ||
COMMENT: HBM4EU - science and policy for a healthy future (https://www.hbm4eu.eu) | ||
CH$NAME: Metoclopramide | ||
CH$NAME: 4-amino-5-chloro-N-[2-(diethylamino)ethyl]-2-methoxybenzamide | ||
CH$COMPOUND_CLASS: N/A; Environmental Standard | ||
CH$FORMULA: C14H22ClN3O2 | ||
CH$EXACT_MASS: 299.1401 | ||
CH$SMILES: CCN(CC)CCNC(=O)C1=CC(=C(C=C1OC)N)Cl | ||
CH$IUPAC: InChI=1S/C14H22ClN3O2/c1-4-18(5-2)7-6-17-14(19)10-8-11(15)12(16)9-13(10)20-3/h8-9H,4-7,16H2,1-3H3,(H,17,19) | ||
CH$LINK: CAS 364-62-5 | ||
CH$LINK: CHEBI 107736 | ||
CH$LINK: KEGG D00726 | ||
CH$LINK: PUBCHEM CID:4168 | ||
CH$LINK: INCHIKEY TTWJBBZEZQICBI-UHFFFAOYSA-N | ||
CH$LINK: CHEMSPIDER 4024 | ||
AC$INSTRUMENT: LTQ Orbitrap XL Thermo Scientific | ||
AC$INSTRUMENT_TYPE: LC-ESI-ITFT | ||
AC$MASS_SPECTROMETRY: MS_TYPE MS2 | ||
AC$MASS_SPECTROMETRY: ION_MODE POSITIVE | ||
AC$MASS_SPECTROMETRY: IONIZATION ESI | ||
AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE CID | ||
AC$MASS_SPECTROMETRY: COLLISION_ENERGY 55% (nominal) | ||
AC$MASS_SPECTROMETRY: RESOLUTION 15000 | ||
AC$CHROMATOGRAPHY: COLUMN_NAME Kinetex Evo C18 2.6 um 50 x 2.1 mm | ||
AC$CHROMATOGRAPHY: FLOW_GRADIENT 95/5 at 0 min, 95/5 at 1 min, 0/100 at 13 min, 0/100 at 24 min, 95/5 at 24.3 min, 95/5 at 32 min | ||
AC$CHROMATOGRAPHY: FLOW_RATE 300 uL/min | ||
AC$CHROMATOGRAPHY: RETENTION_TIME 2.916 min | ||
AC$CHROMATOGRAPHY: SOLVENT A water with 0.1% formic acid | ||
AC$CHROMATOGRAPHY: SOLVENT B methanol with 0.1% formic acid | ||
MS$FOCUSED_ION: BASE_PEAK 300.1474 | ||
MS$FOCUSED_ION: PRECURSOR_M/Z 300.1473 | ||
MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+ | ||
MS$DATA_PROCESSING: RECALIBRATE loess on assigned fragments and MS1 | ||
MS$DATA_PROCESSING: REANALYZE Peaks with additional N2/O included | ||
MS$DATA_PROCESSING: WHOLE RMassBank 2.9.1 | ||
PK$SPLASH: splash10-004i-0090000000-c2ed7280ad2eab977803 | ||
PK$ANNOTATION: m/z tentative_formula formula_count mass error(ppm) | ||
184.0161 C8H7ClNO2+ 1 184.016 0.79 | ||
212.0343 C9H9ClN2O2+ 1 212.0347 -1.97 | ||
227.0582 C10H12ClN2O2+ 1 227.0582 -0.03 | ||
PK$NUM_PEAK: 3 | ||
PK$PEAK: m/z int. rel.int. | ||
184.0161 94825.9 12 | ||
212.0343 9341.9 1 | ||
227.0582 7668302.5 999 | ||
// |
Oops, something went wrong.