Skip to content

Commit

Permalink
Merge branch '103-support-txt-ending-form-mass-bank-format' into 'mas…
Browse files Browse the repository at this point in the history
…ter'

Resolve "Support .txt ending from MassBank format"

Closes #103

See merge request bright-giant/sirius/sirius-libs!67
  • Loading branch information
Markus Fleischauer committed May 23, 2024
2 parents 9e7293d + 7279c2f commit 5e2f866
Show file tree
Hide file tree
Showing 12 changed files with 185 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import de.unijena.bioinf.babelms.msp.MSPExperimentParser;
import de.unijena.bioinf.babelms.mzml.MzMlExperimentParser;
import de.unijena.bioinf.babelms.mzml.MzXmlExperimentParser;
import de.unijena.bioinf.babelms.txt.TxtExperimentParser;
import de.unijena.bioinf.ms.annotations.Ms2ExperimentAnnotation;
import de.unijena.bioinf.ms.properties.PropertyManager;
import org.jetbrains.annotations.NotNull;
Expand All @@ -47,7 +48,7 @@ public class MsExperimentParser {
protected static final Map<String, Class<? extends Parser<Ms2Experiment>>> KNOWN_ENDINGS = addKnownEndings();

// there is no good solution without writing the endings here explicitly (otherwise DESCRIPTION can not be used in annotations)
public static final String DESCRIPTION = ".ms, .mgf, .mzxml, .mzml, .cef, .msp, .mat, .mb, .mblib, .json (GNPS, MoNA), .zip";
public static final String DESCRIPTION = ".ms, .mgf, .mzxml, .mzml, .cef, .msp, .mat, .mb, .mblib, .txt (MassBank), .json (GNPS, MoNA), .zip";

/**
* This postprocessor annotates Parameter configs to the {@link Ms2Experiment}. If {@link InputFileConfig} is given
Expand Down Expand Up @@ -118,7 +119,7 @@ private static Map<String, Class<? extends Parser<Ms2Experiment>>> addKnownEndin
endings.put(".mat", MSPExperimentParser.class);
endings.put(".mb", MassbankExperimentParser.class);
endings.put(".mblib", MassbankExperimentParser.class);
endings.put(".txt", MassbankExperimentParser.class);
endings.put(".txt", TxtExperimentParser.class);
endings.put(".json", JsonExperimentParserDispatcher.class);
return endings;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,15 @@
import de.unijena.bioinf.babelms.MsExperimentParser;
import de.unijena.bioinf.babelms.ReportingInputStream;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.NotNull;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
import java.util.function.BiConsumer;

@Slf4j
public class InputResourceParsingIterator implements Iterator<Ms2Experiment> {

private final Iterator<InputResource<?>> inputResources;
Expand All @@ -47,6 +49,9 @@ public class InputResourceParsingIterator implements Iterator<Ms2Experiment> {

List<BiConsumer<Integer, Long>> listeners = new ArrayList<>();

@Getter
private final Map<String, Exception> parsingErrors = new LinkedHashMap<>();


public InputResourceParsingIterator(@NotNull Iterable<InputResource<?>> inputResources) {
this(inputResources, new MsExperimentParser());
Expand All @@ -64,8 +69,8 @@ public InputResourceParsingIterator(@NotNull Iterator<InputResource<?>> inputRes
public boolean hasNext() {
if (buffer.isEmpty()) {
while (inputResources.hasNext()) {
InputResource<?> next = inputResources.next();
try {
InputResource<?> next = inputResources.next();
String ext = next.getFileExt();
if (!parsers.containsKey(ext))
parsers.put(ext, parser.getParserByExt(ext));
Expand All @@ -76,12 +81,15 @@ public boolean hasNext() {
listeners.forEach(c -> c.accept(chunkRead, this.bytesRead));
});
try (CloseableIterator<Ms2Experiment> it = parsers.get(ext).parseIterator(new BufferedReader(new InputStreamReader(stream)), next.toUri())) {
it.forEachRemaining(buffer::add); //todo maybe one by sone safes memory
it.forEachRemaining(buffer::add); //todo maybe one by one saves memory
}
}
if (!buffer.isEmpty())
break;
} catch (Exception ignored) {
} catch (Exception e) {
String fileName = next.getFilename();
log.error("Error parsing " + fileName, e);
parsingErrors.put(next.getFilename(), e);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
import de.unijena.bioinf.ChemistryBase.ms.utils.SpectrumWithAdditionalFields;
import de.unijena.bioinf.babelms.CloseableIterator;
import de.unijena.bioinf.babelms.SpectralParser;
import lombok.extern.slf4j.Slf4j;
import org.jetbrains.annotations.Nullable;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
Expand All @@ -42,6 +42,7 @@
/**
* Parser for Massbank MSP format
*/
@Slf4j
public class MassbankSpectralParser extends SpectralParser {
public static final String MULTILINE_INDENT = " ";

Expand All @@ -55,28 +56,40 @@ public CloseableIterator<? extends SpectrumWithAdditionalFields<Peak>> parseSpec
SpectrumWithAdditionalFields<Peak> parseSpectrum(BufferedReader reader) throws IOException {
List<String> peakAnnotationsCSV = new ArrayList<>();
AdditionalFields metaInfo = new AdditionalFields(false);
SpectrumWithAdditionalFields<Peak> spectrum = null;
String line;
boolean seenContent = false;

while (!"//".equals(line = reader.readLine())) {

if (line == null) {
if (seenContent) {
throw new RuntimeException("Unexpected end of stream in a MassBank file");
} else {
break;
}
}

while ((line = reader.readLine()) != null) {
if (line.isBlank())
continue;
seenContent = true;

if (line.startsWith(PK_ANNOTATION.k())) {
while ((line = reader.readLine()).startsWith(MULTILINE_INDENT))
while ((line = reader.readLine()).startsWith(MULTILINE_INDENT)) {
peakAnnotationsCSV.add(line.strip());
}
}

if (line.startsWith(PK_PEAK.k())) {
int peaks = metaInfo.getField(PK_NUM_PEAK.k()).map(Integer::parseInt).orElse(-1);
SpectrumWithAdditionalFields<Peak> spectrum;
{
double[] masses = new double[peaks];
double[] intensities = new double[peaks];

for (int i = 0; i < masses.length; ) {
line = reader.readLine();
if (line == null) {
LoggerFactory.getLogger(getClass()).warn("Unexpected end of peak list. Ended at '" + i + "/" + peaks + "'.");
log.warn("Unexpected end of peak list. Ended at '" + i + "/" + peaks + "'.");
break;
}
if (!line.isBlank()) {
Expand All @@ -90,7 +103,7 @@ SpectrumWithAdditionalFields<Peak> parseSpectrum(BufferedReader reader) throws I
spectrum = new SimpleSpectrumWithAdditionalFields(masses, intensities);

if (spectrum.isEmpty())
LoggerFactory.getLogger(getClass()).error("0 Peaks found in current Block, Returning empty spectrum with meta data");
log.error("0 Peaks found in current Block, Returning empty spectrum with meta data");
}

String msLevel = metaInfo.get(AC_MASS_SPECTROMETRY_MS_TYPE.k());
Expand All @@ -107,18 +120,14 @@ SpectrumWithAdditionalFields<Peak> parseSpectrum(BufferedReader reader) throws I
.orElseThrow(() -> new IOException("Could neither parse '" + MS_FOCUSED_ION_ION_TYPE.k() + "' nor '" + AC_MASS_SPECTROMETRY_ION_MODE.k() + "!"))
.getIonization());
}

spectrum.setAdditionalFields(metaInfo);
return spectrum;
}


withKeyValue(line, metaInfo::putIfAbsent);
}

LoggerFactory.getLogger("Unexpected end of Stream. No Peaks found! No spectrum returned.");
return null;

if (spectrum != null) {
spectrum.setAdditionalFields(metaInfo);
}
return spectrum;
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package de.unijena.bioinf.babelms.txt;

import de.unijena.bioinf.ChemistryBase.ms.Ms2Experiment;
import de.unijena.bioinf.ChemistryBase.utils.FileUtils;
import de.unijena.bioinf.babelms.Parser;
import de.unijena.bioinf.babelms.massbank.MassbankExperimentParser;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;

/**
 * {@link Parser} for {@code .txt} inputs that are handled as MassBank records.
 * <p>
 * All parsing work is forwarded to a {@link MassbankExperimentParser}; this class only
 * wraps any failure of that delegate into a {@link RuntimeException} carrying a
 * MassBank-{@code .txt}-specific message.
 */
public class TxtExperimentParser implements Parser<Ms2Experiment> {

    /** MassBank parser that performs the actual parsing; assigned once and never replaced. */
    private final MassbankExperimentParser delegate = new MassbankExperimentParser();

    /**
     * Parses the next experiment from {@code reader} by delegating to the MassBank parser.
     *
     * @param reader character source to read the record from
     * @param source origin of the data, passed through to the delegate unchanged
     * @return whatever the delegate returns for this reader
     * @throws RuntimeException if the delegate throws any exception; the original exception
     *                          is kept as the cause. Note that this includes
     *                          {@link IOException}s, so despite the {@code throws} clause
     *                          callers will observe I/O failures as {@code RuntimeException}.
     */
    @Override
    public Ms2Experiment parse(BufferedReader reader, URI source) throws IOException {
        try {
            return delegate.parse(reader, source);
        } catch (Exception e) {
            // Deliberately broad: any failure means the .txt file is not a usable MassBank record.
            throw new RuntimeException("Could not parse MassBank .txt file", e);
        }
    }

    /**
     * Convenience overload that wraps {@code inputStream} in a buffered reader and parses it.
     *
     * @param inputStream byte source to read the record from
     * @param source      origin of the data, passed through to the delegate unchanged
     * @return the result of {@link #parse(BufferedReader, URI)}
     * @throws RuntimeException if parsing fails, see {@link #parse(BufferedReader, URI)}
     */
    @Override
    public Ms2Experiment parse(InputStream inputStream, URI source) throws IOException {
        // NOTE(review): no charset is given, so the reader's default charset is used — confirm this is intended.
        return parse(FileUtils.ensureBuffering(new InputStreamReader(inputStream)), source);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,12 @@

/**
 * Helpers for locating and loading parser test fixtures from the test classpath.
 */
public class ParserTestUtils {

    /**
     * Resolves a classpath resource to a {@link File}.
     *
     * @param path resource path as understood by {@link ClassLoader#getResource(String)}
     * @return the file the resource URL points to
     * @throws NullPointerException     if no resource with that path is on the classpath
     * @throws IllegalArgumentException if the resource URL cannot be turned into a file
     *                                  (e.g. it is not a valid or not a {@code file:} URI)
     */
    public static File getTestFile(String path) {
        java.net.URL resource = Objects.requireNonNull(
                ParserTestUtils.class.getClassLoader().getResource(path),
                () -> "Test resource not found on classpath: " + path);
        try {
            // URL#getFile() keeps percent-escapes (a space stays "%20"), which yields a
            // non-existent File for checkouts in such directories; going through the URI decodes them.
            return new File(resource.toURI());
        } catch (java.net.URISyntaxException e) {
            throw new IllegalArgumentException("Test resource URL is not a valid URI: " + resource, e);
        }
    }

    /**
     * Parses the given classpath resource with the parser selected for it by
     * {@link MsExperimentParser} and returns the first experiment of the result.
     *
     * @param file resource path of the fixture, resolved via {@link #getTestFile(String)}
     * @return the first parsed experiment
     * @throws IOException if parsing the file fails with an I/O error
     */
    public static Ms2Experiment loadExperiment(String file) throws IOException {
        File input = getTestFile(file);
        return new MsExperimentParser().getParser(input).parseFromFile(input).get(0);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Objects;

import static de.unijena.bioinf.babelms.ParserTestUtils.loadExperiment;
import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -77,7 +76,7 @@ public void testParseSpectrum() throws IOException {

@Test
public void testRootArray() throws IOException {
File input = new File(Objects.requireNonNull(ParserTestUtils.class.getClassLoader().getResource("gnps/spectrum_array.json")).getFile());
File input = ParserTestUtils.getTestFile("gnps/spectrum_array.json");
List<Ms2Experiment> experiments = new MsExperimentParser().getParser(input).parseFromFile(input);

assertEquals(3, experiments.size());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import de.unijena.bioinf.ChemistryBase.ms.ft.model.IsotopeMs2Settings;
import de.unijena.bioinf.ChemistryBase.ms.utils.SimpleSpectrum;
import de.unijena.bioinf.babelms.MsExperimentParser;
import de.unijena.bioinf.babelms.ParserTestUtils;
import de.unijena.bioinf.ms.properties.PropertyManager;
import org.junit.Test;

Expand All @@ -23,7 +24,7 @@ public class MsParserTest{
@Test
public void readFile() throws Exception {
//values example file
File input = new File(this.getClass().getClassLoader().getResource("Adenosine.ms").getFile());
File input = ParserTestUtils.getTestFile("Adenosine.ms");
Ms2Experiment experiment = new MsExperimentParser().getParser(input).parseFromFile(input).get(0);

assertEquals("molecule name differs", experiment.getName(), "Adenosine");
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package de.unijena.bioinf.babelms.txt;

import de.unijena.bioinf.ChemistryBase.ms.Ms2Experiment;
import de.unijena.bioinf.babelms.ParserTestUtils;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;

import static org.junit.jupiter.api.Assertions.*;

/**
 * Tests for {@link TxtExperimentParser} against MassBank {@code .txt} fixtures:
 * a valid record, a file that is not a MassBank record, and an empty file.
 */
public class TxtExperimentParserTest {

    private static final String VALID_RECORD = "massbank/MSBNK-UFZ-UP000040.txt";
    private static final String INVALID_RECORD = "massbank/invalid.txt";
    private static final String EMPTY_RECORD = "massbank/empty.txt";

    /**
     * A valid record yields one experiment whose first MS2 spectrum has two peaks;
     * a second parse call on the same stream yields {@code null}.
     */
    @Test
    public void testParseValidFile() throws IOException {
        final File recordFile = ParserTestUtils.getTestFile(VALID_RECORD);
        final URI recordUri = recordFile.toURI();
        final TxtExperimentParser txtParser = new TxtExperimentParser();

        try (FileInputStream in = new FileInputStream(recordFile)) {
            Ms2Experiment parsed = txtParser.parse(in, recordUri);
            assertNotNull(parsed);
            assertEquals(2, parsed.getMs2Spectra().get(0).size());

            // Parsing again from the same stream must not produce another experiment.
            Ms2Experiment afterEnd = txtParser.parse(in, recordUri);
            assertNull(afterEnd);
        }
    }

    /** Content that is not a MassBank record makes the parser throw a {@link RuntimeException}. */
    @Test
    public void testParseInvalidFile() throws IOException {
        final File recordFile = ParserTestUtils.getTestFile(INVALID_RECORD);
        final URI recordUri = recordFile.toURI();
        final TxtExperimentParser txtParser = new TxtExperimentParser();

        try (FileInputStream in = new FileInputStream(recordFile)) {
            assertThrows(RuntimeException.class, () -> txtParser.parse(in, recordUri));
        }
    }

    /** An empty file is not an error; the parser returns {@code null}. */
    @Test
    public void testParseEmptyFile() throws IOException {
        final File recordFile = ParserTestUtils.getTestFile(EMPTY_RECORD);
        final URI recordUri = recordFile.toURI();
        final TxtExperimentParser txtParser = new TxtExperimentParser();

        try (FileInputStream in = new FileInputStream(recordFile)) {
            Ms2Experiment parsed = txtParser.parse(in, recordUri);
            assertNull(parsed);
        }
    }
}
50 changes: 50 additions & 0 deletions io/src/test/resources/massbank/MSBNK-UFZ-UP000040.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
ACCESSION: MSBNK-UFZ-UP000040
RECORD_TITLE: Triethylene glycol monomethyl ether; LC-ESI-ITFT; MS2; CE: 25%; R=15000; [M+H]+
DATE: 2019.07.31
AUTHORS: Liza-Marie Beckers, Werner Brack, Janek-Paul Dann, Martin Krauss, Erik Mueller, Tobias Schulze, Helmholtz Centre for Environmental Research GmbH - UFZ, Leipzig, Germany
LICENSE: CC0
COPYRIGHT: Copyright (C) 2019
PUBLICATION: Beckers L-M, Brack W, Dann JP, Krauss M, Mueller E, Schulze T, 2020. Unraveling longitudinal pollution patterns of organic micropollutants in a river by non-target screening and cluster analysis. Science of The Total Environment, https://doi.org/10.1016/j.scitotenv.2020.138388
COMMENT: CONFIDENCE Reference Standard (Level 1)
COMMENT: INTERNAL_ID 5
CH$NAME: Triethylene glycol monomethyl ether
CH$NAME: 2-[2-(2-methoxyethoxy)ethoxy]ethanol
CH$COMPOUND_CLASS: N/A; Environmental Standard
CH$FORMULA: C7H16O4
CH$EXACT_MASS: 164.1049
CH$SMILES: COCCOCCOCCO
CH$IUPAC: InChI=1S/C7H16O4/c1-9-4-5-11-7-6-10-3-2-8/h8H,2-7H2,1H3
CH$LINK: CAS 112-35-6
CH$LINK: CHEBI 84233
CH$LINK: PUBCHEM CID:8178
CH$LINK: INCHIKEY JLGLQAWTXXGVEM-UHFFFAOYSA-N
CH$LINK: CHEMSPIDER 7886
AC$INSTRUMENT: LTQ Orbitrap XL Thermo Scientific
AC$INSTRUMENT_TYPE: LC-ESI-ITFT
AC$MASS_SPECTROMETRY: MS_TYPE MS2
AC$MASS_SPECTROMETRY: ION_MODE POSITIVE
AC$MASS_SPECTROMETRY: IONIZATION ESI
AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE HCD
AC$MASS_SPECTROMETRY: COLLISION_ENERGY 25 % (nominal)
AC$MASS_SPECTROMETRY: RESOLUTION 15000
AC$CHROMATOGRAPHY: COLUMN_NAME Kinetex Evo C18 2.6 um 50 x 2.1 mm
AC$CHROMATOGRAPHY: FLOW_GRADIENT 95/5 at 0 min, 95/5 at 1 min, 0/100 at 13 min, 0/100 at 24 min, 95/5 at 24.3 min, 95/5/0 at 32 min
AC$CHROMATOGRAPHY: FLOW_RATE 300 uL/min
AC$CHROMATOGRAPHY: RETENTION_TIME 1.410 min
AC$CHROMATOGRAPHY: SOLVENT A water with 0.1% formic acid
AC$CHROMATOGRAPHY: SOLVENT B methanol with 0.1% formic acid
MS$FOCUSED_ION: BASE_PEAK 165.1119
MS$FOCUSED_ION: PRECURSOR_M/Z 165.1121
MS$FOCUSED_ION: PRECURSOR_TYPE [M+H]+
MS$DATA_PROCESSING: RECALIBRATE loess on assigned fragments and MS1
MS$DATA_PROCESSING: REANALYZE Peaks with additional N2/O included
MS$DATA_PROCESSING: WHOLE RMassBank 2.12.0
PK$SPLASH: splash10-0zfr-5900000000-a3f9bbebe85d609d644c
PK$ANNOTATION: m/z tentative_formula formula_count mass error(ppm)
59.049 C3H7O+ 1 59.0491 -3.05
103.0753 C5H11O2+ 1 103.0754 -0.48
PK$NUM_PEAK: 2
PK$PEAK: m/z int. rel.int.
59.049 2330.9 661
103.0753 3518.3 999
//
Empty file.
2 changes: 2 additions & 0 deletions io/src/test/resources/massbank/invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Not a massbank txt file
Should break parsing
Original file line number Diff line number Diff line change
Expand Up @@ -188,14 +188,18 @@ public void importSpectraFromResources(List<InputResource<?>> spectrumFiles) thr

addToSpectraBuffer(specs);
}
if (!iterator.getParsingErrors().isEmpty()) {
String files = "'" + String.join("', '", iterator.getParsingErrors().keySet()) + "'";
throw new RuntimeException("Following files could not be imported: " + files);
}
}

public void importStructuresFromSmileAndInChis(String smilesOrInChI) throws IOException {
throwIfShutdown();
importStructuresFromSmileAndInChis(smilesOrInChI, null, null);
}

public Optional<Molecule> importStructuresFromSmileAndInChis(@Nullable String smilesOrInChI, @Nullable String id, @Nullable String name) throws IOException {
public Optional<Molecule> importStructuresFromSmileAndInChis(@Nullable String smilesOrInChI, @Nullable String id, @Nullable String name) {
throwIfShutdown();
if (smilesOrInChI == null || smilesOrInChI.isBlank()) {
LoggerFactory.getLogger(getClass()).warn("No structure information given in Line ' " + smilesOrInChI + "\t" + id + "\t" + name + "'. Skipping!");
Expand Down Expand Up @@ -620,7 +624,7 @@ public static class Molecule {
private final Set<String> ids = new HashSet<>();
private String name = null;
@NotNull
private IAtomContainer container;
private final IAtomContainer container;

private Molecule(@NotNull IAtomContainer container, @NotNull Smiles smiles, @NotNull InChI inchi) {
this.container = container;
Expand Down Expand Up @@ -728,7 +732,7 @@ public static JJob<Boolean> makeImportToDatabaseJob(
) {
return new BasicJJob<Boolean>() {
CustomDatabaseImporter importer;
CustomDatabaseImporter.Listener l = listener;
final CustomDatabaseImporter.Listener l = listener;

@Override
protected Boolean compute() throws Exception {
Expand Down

0 comments on commit 5e2f866

Please sign in to comment.