Skip to content

Commit

Permalink
Merge branch '99-enable-reference-spectra-parsing-from-sdf-files' int…
Browse files Browse the repository at this point in the history
…o 'master'

Resolve "Enable reference spectra parsing from .sdf files"

Closes #99

See merge request bright-giant/sirius/sirius-libs!72
  • Loading branch information
Markus Fleischauer committed May 28, 2024
2 parents 82945f4 + 24b5ff9 commit ca5e395
Show file tree
Hide file tree
Showing 11 changed files with 492 additions and 75 deletions.
2 changes: 1 addition & 1 deletion io/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ dependencies {
api project(':lcms')
api project(':elgordo')

for (module in ["data", "smiles", "formula"])
for (module in ["data", "smiles", "formula", "qsarmolecular", "bundle"])
api "org.openscience.cdk:cdk-$module:$cdk_version"
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import de.unijena.bioinf.babelms.msp.MSPExperimentParser;
import de.unijena.bioinf.babelms.mzml.MzMlExperimentParser;
import de.unijena.bioinf.babelms.mzml.MzXmlExperimentParser;
import de.unijena.bioinf.babelms.sdf.SdfExperimentParser;
import de.unijena.bioinf.babelms.txt.TxtExperimentParser;
import de.unijena.bioinf.ms.annotations.Ms2ExperimentAnnotation;
import de.unijena.bioinf.ms.properties.PropertyManager;
Expand Down Expand Up @@ -121,6 +122,7 @@ private static Map<String, Class<? extends Parser<Ms2Experiment>>> addKnownEndin
endings.put(".mblib", MassbankExperimentParser.class);
endings.put(".txt", TxtExperimentParser.class);
endings.put(".json", JsonExperimentParserDispatcher.class);
endings.put(".sdf", SdfExperimentParser.class);
return endings;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
@Data
@Builder
public class ExperimentData {
/**
* Only for error reporting purposes
*/
private String id;

private SimpleSpectrum spectrum;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ public MutableMs2Experiment parse(ExperimentData data) {
protected void addSpectrum() {
SimpleSpectrum spectrum = data.getSpectrum();
if (spectrum == null) {
throw new RuntimeException("Spectrum is not set in record " + data.getId() + ".");
log.warn("Spectrum is missing in record " + data.getId() + ".");
return;
}

String spectrumLevel = data.getSpectrumLevel();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package de.unijena.bioinf.babelms.sdf;

import de.unijena.bioinf.ChemistryBase.chem.InChI;
import de.unijena.bioinf.ChemistryBase.ms.Ms2Experiment;
import de.unijena.bioinf.ChemistryBase.ms.MutableMs2Experiment;
import de.unijena.bioinf.ChemistryBase.ms.SpectrumFileSource;
import de.unijena.bioinf.ChemistryBase.ms.utils.SimpleSpectrum;
import de.unijena.bioinf.ChemistryBase.utils.FileUtils;
import de.unijena.bioinf.babelms.Parser;
import de.unijena.bioinf.babelms.intermediate.ExperimentData;
import de.unijena.bioinf.babelms.intermediate.ExperimentDataParser;
import io.github.dan2097.jnainchi.InchiFlag;
import io.github.dan2097.jnainchi.InchiStatus;
import lombok.extern.slf4j.Slf4j;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.inchi.InChIGenerator;
import org.openscience.cdk.inchi.InChIGeneratorFactory;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.silent.SilentChemObjectBuilder;
import org.openscience.cdk.smiles.SmilesGenerator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;

import static de.unijena.bioinf.ChemistryBase.chem.InChIs.isStandardInchi;
import static de.unijena.bioinf.ChemistryBase.chem.InChIs.newInChI;

/**
 * {@link Parser} that reads reference spectra from SDF records.
 * <p>
 * Each call to {@code parse} consumes at most one record from the underlying reader and converts it
 * into an {@link Ms2Experiment} via the intermediate {@link ExperimentData} representation.
 * Record metadata is read from the SDF properties {@code id}, {@code name},
 * {@code mass spectral peaks}, {@code spectrum type}, {@code splash}, {@code precursor m/z},
 * {@code precursor type}, {@code instrument}, {@code instrument type}, {@code collision energy}
 * and {@code formula}; InChI and SMILES are generated from the parsed structure.
 * <p>
 * The parser keeps the last seen input stream and its wrapping reader as instance state, so one
 * instance should not be shared between concurrently parsed streams.
 */
@Slf4j
public class SdfExperimentParser implements Parser<Ms2Experiment> {

    InputStream lastSeenInputStream = null;
    BufferedReader lastWrappingReader = null;

    /**
     * Reads the next SDF record from {@code reader}.
     *
     * @param reader reader positioned at the start of a record
     * @param source location of the parsed file, stored as {@link SpectrumFileSource} annotation
     * @return the parsed experiment, or {@code null} if the reader contains no further record
     */
    @Override
    public Ms2Experiment parse(BufferedReader reader, URI source) throws IOException {
        IteratingSDFReader sdfReader = new IteratingSDFReader(reader, SilentChemObjectBuilder.getInstance());
        if (!sdfReader.hasNext()) {
            return null;
        }
        IAtomContainer sdfData = sdfReader.next();
        ExperimentData data = extractData(sdfData);
        MutableMs2Experiment experiment = new ExperimentDataParser().parse(data);
        experiment.setAnnotation(SpectrumFileSource.class, new SpectrumFileSource(source));
        return experiment;
    }

    /**
     * Copies the SDF properties and the derived InChI/SMILES into an {@link ExperimentData}.
     */
    private ExperimentData extractData(IAtomContainer sdfData) {
        String name = sdfData.getProperty("name");
        String id = sdfData.getProperty("id");
        String instrument = sdfData.getProperty("instrument");
        String instrumentType = sdfData.getProperty("instrument type");

        ExperimentData data = ExperimentData.builder()
                // the id is only used for error reporting, so the name is an acceptable fallback
                .id(id != null ? id : name)
                .spectrum(parseSpectrum(sdfData.getProperty("mass spectral peaks")))
                .spectrumLevel(sdfData.getProperty("spectrum type"))
                .splash(sdfData.getProperty("splash"))
                .precursorMz(sdfData.getProperty("precursor m/z"))
                .precursorIonType(sdfData.getProperty("precursor type"))
                .instrumentation(joinPresent(instrument, instrumentType))
                .collisionEnergy(sdfData.getProperty("collision energy"))
                .compoundName(name)
                .molecularFormula(sdfData.getProperty("formula"))
                .build();

        fillInchi(data, sdfData);
        fillSmiles(data, sdfData);
        return data;
    }

    /**
     * Joins the given values with a single space, ignoring values that are {@code null} or blank.
     * Plain string concatenation would otherwise yield the literal text "null" for missing properties.
     *
     * @return the joined text, or {@code null} if neither value is present
     */
    private static String joinPresent(String first, String second) {
        boolean hasFirst = first != null && !first.trim().isEmpty();
        boolean hasSecond = second != null && !second.trim().isEmpty();
        if (hasFirst && hasSecond) {
            return first + " " + second;
        }
        if (hasFirst) {
            return first;
        }
        return hasSecond ? second : null;
    }

    /**
     * Parses a peak list with one {@code <m/z> <intensity>} pair per line.
     * Any line terminator is accepted, values may be separated by arbitrary whitespace,
     * and blank lines are skipped.
     *
     * @param peaksStr raw peak list, may be {@code null}
     * @return the parsed spectrum, or {@code null} if {@code peaksStr} is {@code null} or contains no peaks
     * @throws IllegalArgumentException if a non-blank line has fewer than two values
     *                                  or a value is not a number ({@link NumberFormatException})
     */
    private SimpleSpectrum parseSpectrum(String peaksStr) {
        if (peaksStr == null) {
            return null;
        }
        String[] lines = peaksStr.split("\\R");
        double[] masses = new double[lines.length];
        double[] intensities = new double[lines.length];
        int count = 0;
        for (String rawLine : lines) {
            String line = rawLine.trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] pair = line.split("\\s+");
            if (pair.length < 2) {
                throw new IllegalArgumentException("Malformed peak line '" + line + "', expected '<m/z> <intensity>'.");
            }
            masses[count] = Double.parseDouble(pair[0]);
            intensities[count] = Double.parseDouble(pair[1]);
            count++;
        }
        if (count == 0) {
            // no peaks at all: report as missing spectrum rather than failing on an empty token
            return null;
        }
        if (count < lines.length) {
            // blank lines were skipped, drop the unused tail of the arrays
            masses = truncate(masses, count);
            intensities = truncate(intensities, count);
        }
        return new SimpleSpectrum(masses, intensities);
    }

    /**
     * Returns a copy of the first {@code length} entries of {@code values}.
     */
    private static double[] truncate(double[] values, int length) {
        double[] shortened = new double[length];
        System.arraycopy(values, 0, shortened, 0, length);
        return shortened;
    }

    /**
     * Generates the InChI for the record's structure and stores it in {@code data}.
     * A {@link CDKException} during generation is logged and otherwise ignored, so the record
     * is still parsed without InChI.
     */
    private void fillInchi(ExperimentData data, IAtomContainer sdfData) {
        try {
            InChI inchi = getInchi(sdfData);
            if (inchi != null) {
                data.setInchi(inchi.in3D);
                data.setInchiKey(inchi.key);
            }
        } catch (CDKException e) {
            log.warn("Could not create InChI from sdf data", e);
        }
    }

    /**
     * Temporary copy-paste from InChISMILESUtils. Cannot use it directly because of a circular dependency, todo issue #114
     *
     * @param atomContainer structure to create the InChI for
     * @return the generated InChI, or {@code null} if the generator produced none
     * @throws CDKException          if the generator reports {@link InchiStatus#ERROR}
     * @throws IllegalStateException if the generated InChI is not a standard InChI
     */
    public static InChI getInchi(IAtomContainer atomContainer) throws CDKException {
        // this will create a standard inchi, see: https://egonw.github.io/cdkbook/inchi.html
        // removing stereoInformation produces much less warnings, including 'Omitted undefined stereo'
        InChIGenerator inChIGenerator = InChIGeneratorFactory.getInstance().getInChIGenerator(atomContainer, InchiFlag.SNon);
        InchiStatus state = inChIGenerator.getStatus();
        if (state == InchiStatus.ERROR) {
            throw new CDKException("Error while creating InChI. State: '" + state + "'. Message: '" + inChIGenerator.getMessage() + "'.");
        }
        if (state == InchiStatus.WARNING) {
            log.debug("Warning while reading AtomContainer with title '{}' -> {}", atomContainer.getTitle(), inChIGenerator.getMessage());
        }
        String inchi = inChIGenerator.getInchi();
        if (inchi == null) {
            return null;
        }
        if (!isStandardInchi(inchi)) {
            throw new IllegalStateException("Non standard Inchi was created ('" + inchi + "'), which is not expected behaviour. Please submit a bug report!");
        }
        String key = inChIGenerator.getInchiKey();
        return newInChI(key, inchi);
    }

    /**
     * Generates a unique SMILES for the record's structure and stores it in {@code data}.
     * A {@link CDKException} during generation is logged and otherwise ignored.
     */
    private void fillSmiles(ExperimentData data, IAtomContainer sdfData) {
        try {
            data.setSmiles(SmilesGenerator.unique().create(sdfData));
        } catch (CDKException e) {
            log.warn("Could not create smiles from sdf data", e);
        }
    }

    /**
     * Reads the next SDF record from {@code inputStream}.
     * <p>
     * The wrapping reader is created once per stream instance and reused while the same stream is
     * passed again, so repeated calls continue with the reader used by the previous call
     * (presumably so that data already buffered by that reader is not lost between records).
     *
     * @return the parsed experiment, or {@code null} if the stream contains no further record
     */
    @Override
    public Ms2Experiment parse(InputStream inputStream, URI source) throws IOException {
        if (inputStream != lastSeenInputStream) {
            lastSeenInputStream = inputStream;
            lastWrappingReader = FileUtils.ensureBuffering(new InputStreamReader(inputStream));
        }
        return parse(lastWrappingReader, source);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ public class TxtExperimentParser implements Parser<Ms2Experiment> {

MassbankExperimentParser delegate = new MassbankExperimentParser();

InputStream lastSeenInputStream = null;
BufferedReader lastWrappingReader = null;

@Override
public Ms2Experiment parse(BufferedReader reader, URI source) throws IOException {
try {
Expand All @@ -26,6 +29,10 @@ public Ms2Experiment parse(BufferedReader reader, URI source) throws IOException

/**
 * Parses the next record from {@code inputStream}.
 * <p>
 * The buffering reader is created once per stream instance and reused while the same stream is
 * passed again. Wrapping the stream in a fresh reader on every call (the previous behaviour)
 * would start each call with a new buffer, so a second call on a multi-record file could not
 * continue where the first one stopped.
 *
 * @param inputStream stream to read from; pass the same instance to read consecutive records
 * @param source      location of the parsed file
 * @return the result of {@code parse(BufferedReader, URI)} on the reused reader
 */
@Override
public Ms2Experiment parse(InputStream inputStream, URI source) throws IOException {
    if (inputStream != lastSeenInputStream) {
        lastSeenInputStream = inputStream;
        lastWrappingReader = FileUtils.ensureBuffering(new InputStreamReader(inputStream));
    }
    return parse(lastWrappingReader, source);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package de.unijena.bioinf.babelms.sdf;

import de.unijena.bioinf.ChemistryBase.chem.InChI;
import de.unijena.bioinf.ChemistryBase.chem.Smiles;
import de.unijena.bioinf.ChemistryBase.ms.*;
import org.junit.jupiter.api.Test;

import java.io.IOException;

import static de.unijena.bioinf.babelms.ParserTestUtils.loadExperiment;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * Checks that the test record {@code sdf/UP000538.sdf} is loaded with the expected
 * spectrum, compound metadata and structure annotations.
 */
class SdfExperimentParserTest {

    @Test
    void parseTest() throws IOException {
        final double tolerance = 1e-9;
        Ms2Experiment parsed = loadExperiment("sdf/UP000538.sdf");

        // spectra: no MS1, exactly one MS2 spectrum with a single peak
        assertTrue(parsed.getMs1Spectra().isEmpty());
        assertEquals(1, parsed.getMs2Spectra().size());

        Ms2Spectrum<Peak> ms2 = parsed.getMs2Spectra().get(0);
        assertEquals(1, ms2.size());
        assertEquals(349.1844, ms2.getMzAt(0), tolerance);
        assertEquals(100, ms2.getIntensityAt(0), tolerance);
        assertEquals(393.2095, ms2.getPrecursorMz(), tolerance);

        // compound metadata
        assertEquals("Octaethylene glycol", parsed.getName());
        assertEquals("C16H34O9", parsed.getMolecularFormula().toString());

        assertEquals(MsInstrumentation.Instrument.ORBI, parsed.getAnnotation(MsInstrumentation.class).orElseThrow());

        // structure annotations generated from the record
        InChI inchi = parsed.getAnnotation(InChI.class).orElseThrow();
        assertEquals("InChI=1S/C16H34O9/c17-1-3-19-5-7-21-9-11-23-13-15-25-16-14-24-12-10-22-8-6-20-4-2-18/h17-18H,1-16H2", inchi.in3D);
        assertEquals("GLZWNFNQMJAZGY-UHFFFAOYSA-N", parsed.getAnnotation(InChI.class).orElseThrow().key);

        Smiles smiles = parsed.getAnnotation(Smiles.class).orElseThrow();
        assertEquals("OCCOCCOCCOCCOCCOCCOCCOCCO", smiles.smiles);

        Splash splash = parsed.getAnnotation(Splash.class).orElseThrow();
        assertEquals("splash10-0002-0009000000-d849df50ad373f4c881c", splash.getSplash());

        assertEquals("[M + Na]+", parsed.getPrecursorIonType().toString());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import de.unijena.bioinf.ChemistryBase.ms.Ms2Experiment;
import de.unijena.bioinf.babelms.ParserTestUtils;
import de.unijena.bioinf.babelms.annotations.CompoundMetaData;
import org.junit.jupiter.api.Test;

import java.io.File;
Expand Down Expand Up @@ -47,4 +48,19 @@ public void testParseEmptyFile() throws IOException {
assertNull(parser.parse(stream, uri));
}
}

/**
 * Parses {@code massbank/multirecord.txt} twice through the same parser and the same stream
 * and expects the two calls to yield the two records of the file in order.
 */
@Test
public void testMultipleRecordsInFile() throws IOException {
    File multiRecordFile = ParserTestUtils.getTestFile("massbank/multirecord.txt");
    URI source = multiRecordFile.toURI();
    TxtExperimentParser parser = new TxtExperimentParser();
    try (FileInputStream in = new FileInputStream(multiRecordFile)) {
        // first call: first record of the file
        CompoundMetaData firstMeta = parser.parse(in, source)
                .getAnnotation(CompoundMetaData.class)
                .orElseThrow();
        assertEquals("MSBNK-HBM4EU-HB002845", firstMeta.getCompoundId());

        // second call on the same stream: second record
        CompoundMetaData secondMeta = parser.parse(in, source)
                .getAnnotation(CompoundMetaData.class)
                .orElseThrow();
        assertEquals("MSBNK-HBM4EU-HB001222", secondMeta.getCompoundId());
    }
}
}

113 changes: 113 additions & 0 deletions io/src/test/resources/massbank/multirecord.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
ACCESSION: MSBNK-HBM4EU-HB002845
RECORD_TITLE: Chlorotoluron-OH-desmethyl (TENTATIVE); LC-ESI-QFT; MS2; CE: 30%; R=70000; [M+H]+
DATE: 2021.02.23
AUTHORS: Carolin Huber, Tobias Schulze, Martin Krauss, Department of Effect-Directed Analysis, Helmholtz Centre for Environmental Research - UFZ GmbH, Leipzig, Germany
LICENSE: CC0
COPYRIGHT: Copyright (C) 2021 Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany
PUBLICATION: Huber C, Mueller E, Schulze T, Brack W, Krauss M, Improving the Screening Analysis of Pesticide Metabolites in Human Biomonitoring by Combining High-Throughput In Vitro Incubation and Automated LC–HRMS Data Processing, Analytical Chemistry, https://doi.org/10.1021/acs.analchem.1c00972
COMMENT: HBM4EU - science and policy for a healthy future (https://www.hbm4eu.eu)
COMMENT: COMMENT: CONFIDENCE: Tentative structure, with evidences on substitutes (Level 3b)
COMMENT: COMMENT: generated by human liver S9 incubation
COMMENT: Chlorotoluron_OH-desmethyl_30eV.txt
CH$NAME: Chlorotoluron-OH-desmethyl
CH$COMPOUND_CLASS: N/A; Biotransformation Product
CH$FORMULA: C9H11ClN2O2
CH$EXACT_MASS: 214.05
CH$SMILES: C(*)C1=C(c(*)=C(C(*)=C(*)1)NC(=O)N(C(*)))Cl * = [OH (n=1) & H (n=4)]
CH$IUPAC: N/A
AC$INSTRUMENT: Q Exactive Plus Orbitrap Thermo Scientific
AC$INSTRUMENT_TYPE: LC-ESI-QFT
AC$MASS_SPECTROMETRY: MS_TYPE MS2
AC$MASS_SPECTROMETRY: ION_MODE POSITIVE
AC$MASS_SPECTROMETRY: IONIZATION ESI
AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE HCD
AC$MASS_SPECTROMETRY: COLLISION_ENERGY 30% (nominal)
AC$MASS_SPECTROMETRY: RESOLUTION 70000
AC$CHROMATOGRAPHY: COLUMN_NAME Waters UPLC BEH C18 1.7 um 2.1 mm x 100 mm with pre-column
AC$CHROMATOGRAPHY: FLOW_GRADIENT 100/0 at 0 min, 0/100 at 15 min, 0/100 at 21 min, 100/0 at 22 min, 100/0 at 30 min
AC$CHROMATOGRAPHY: FLOW_RATE 300 uL/min
AC$CHROMATOGRAPHY: RETENTION_TIME 8.54 min
AC$CHROMATOGRAPHY: SOLVENT A water with 1% ammonium carbonate 1M
AC$CHROMATOGRAPHY: SOLVENT B methanol with 1% ammonium carbonate 1M and 5% water
MS$FOCUSED_ION: PRECURSOR_M/Z 215.0578
MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+
MS$DATA_PROCESSING: COMMENT Peaks removed that cannot be explained by GenForm and Molecular Formula (5 ppm)
MS$DATA_PROCESSING: WHOLE mzR
PK$SPLASH: splash10-0006-9600000000-16611426f75bfbd952c5
PK$NUM_PEAK: 20
PK$PEAK: m/z int. rel.int.
58.029 601491.8 68
65.0386 21341.5 2
66.0466 24528.2 3
77.0385 18312 2
85.0284 16591 2
92.0494 153550.8 17
93.0573 8843257 999
98.9996 22119.4 2
104.0494 29876.4 3
105.0574 70014.1 8
110.0602 21156.3 2
113.0152 38686.4 4
122.9997 160015.2 18
126.0105 60570.7 7
128.0262 4926958.7 557
129.0101 32821.9 4
139.0059 39133.8 4
140.0262 1401638.5 158
154.0056 55904.7 6
158.0368 658013.5 74
//

ACCESSION: MSBNK-HBM4EU-HB001222
RECORD_TITLE: Metoclopramide; LC-ESI-ITFT; MS2; CE: 55%; R=15000; [M+H]+
DATE: 2018.09.08
AUTHORS: Tobias Schulze, Carolin Huber, Martin Krauss, Department of Effect-Directed Analysis, Helmholtz Centre for Environmental Research GmbH - UFZ, Leipzig, Germany
LICENSE: CC0
COPYRIGHT: Copyright (C) 2018
PUBLICATION: Oberacher H, Sasse M, Antignac J-P, Guitton Y, Debrauwer L, Jamin E L, Schulze T, Krauss M, Covaci A, Caballero-Casero N, Rosseau K, Damont A, Fenaille F, Lamoree M, Schymanski E, A European proposal for quality control and quality assurance of tandem mass spectral libraries, Environmental Sciences Europe, https://doi.org/10.1186/s12302-020-00314-9
COMMENT: CONFIDENCE Reference Standard (Level 1)
COMMENT: HBM4EU - science and policy for a healthy future (https://www.hbm4eu.eu)
CH$NAME: Metoclopramide
CH$NAME: 4-amino-5-chloro-N-[2-(diethylamino)ethyl]-2-methoxybenzamide
CH$COMPOUND_CLASS: N/A; Environmental Standard
CH$FORMULA: C14H22ClN3O2
CH$EXACT_MASS: 299.1401
CH$SMILES: CCN(CC)CCNC(=O)C1=CC(=C(C=C1OC)N)Cl
CH$IUPAC: InChI=1S/C14H22ClN3O2/c1-4-18(5-2)7-6-17-14(19)10-8-11(15)12(16)9-13(10)20-3/h8-9H,4-7,16H2,1-3H3,(H,17,19)
CH$LINK: CAS 364-62-5
CH$LINK: CHEBI 107736
CH$LINK: KEGG D00726
CH$LINK: PUBCHEM CID:4168
CH$LINK: INCHIKEY TTWJBBZEZQICBI-UHFFFAOYSA-N
CH$LINK: CHEMSPIDER 4024
AC$INSTRUMENT: LTQ Orbitrap XL Thermo Scientific
AC$INSTRUMENT_TYPE: LC-ESI-ITFT
AC$MASS_SPECTROMETRY: MS_TYPE MS2
AC$MASS_SPECTROMETRY: ION_MODE POSITIVE
AC$MASS_SPECTROMETRY: IONIZATION ESI
AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE CID
AC$MASS_SPECTROMETRY: COLLISION_ENERGY 55% (nominal)
AC$MASS_SPECTROMETRY: RESOLUTION 15000
AC$CHROMATOGRAPHY: COLUMN_NAME Kinetex Evo C18 2.6 um 50 x 2.1 mm
AC$CHROMATOGRAPHY: FLOW_GRADIENT 95/5 at 0 min, 95/5 at 1 min, 0/100 at 13 min, 0/100 at 24 min, 95/5 at 24.3 min, 95/5 at 32 min
AC$CHROMATOGRAPHY: FLOW_RATE 300 uL/min
AC$CHROMATOGRAPHY: RETENTION_TIME 2.916 min
AC$CHROMATOGRAPHY: SOLVENT A water with 0.1% formic acid
AC$CHROMATOGRAPHY: SOLVENT B methanol with 0.1% formic acid
MS$FOCUSED_ION: BASE_PEAK 300.1474
MS$FOCUSED_ION: PRECURSOR_M/Z 300.1473
MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+
MS$DATA_PROCESSING: RECALIBRATE loess on assigned fragments and MS1
MS$DATA_PROCESSING: REANALYZE Peaks with additional N2/O included
MS$DATA_PROCESSING: WHOLE RMassBank 2.9.1
PK$SPLASH: splash10-004i-0090000000-c2ed7280ad2eab977803
PK$ANNOTATION: m/z tentative_formula formula_count mass error(ppm)
184.0161 C8H7ClNO2+ 1 184.016 0.79
212.0343 C9H9ClN2O2+ 1 212.0347 -1.97
227.0582 C10H12ClN2O2+ 1 227.0582 -0.03
PK$NUM_PEAK: 3
PK$PEAK: m/z int. rel.int.
184.0161 94825.9 12
212.0343 9341.9 1
227.0582 7668302.5 999
//
Loading

0 comments on commit ca5e395

Please sign in to comment.