Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Smiles import minor adaptions #13

Merged
merged 5 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 35 additions & 19 deletions src/main/java/de/unijena/cheminf/mortar/model/io/Importer.java
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,10 @@ private void preprocessMoleculeSet(IAtomContainerSet aMoleculeSet) throws NullPo
//
//<editor-fold desc="protected methods" defaultstate="collapsed">
/**
* Imports a SMILES file. This method is able to parse different types of SMILES files, e.g. with and without header
* or with only one column or two (SMILES and name/ID, which is in which column is detected).
* Imports a SMILES file. This method is able to parse differently formatted SMILES files, e.g. with and without
* header or with one or two columns (SMILES and name/ID). If no name can be detected for a structure, the structure
* is assigned the name of the file extended with the index of the structure in the file as name.
* <br>
* Protected and not private for testing in class ImporterTest.
*
* @param aFile a SMILES codes-containing *.txt or *.smi file
Expand All @@ -402,22 +404,23 @@ private void preprocessMoleculeSet(IAtomContainerSet aMoleculeSet) throws NullPo
protected IAtomContainerSet importSMILESFile(File aFile) throws IOException {
try (
FileReader tmpSmilesFileReader = new FileReader(aFile);
BufferedReader tmpSmilesFileBufferedReader = new BufferedReader(tmpSmilesFileReader, BasicDefinitions.BUFFER_SIZE)
BufferedReader tmpSmilesFileBufferedReader = new BufferedReader(tmpSmilesFileReader,
BasicDefinitions.BUFFER_SIZE)
) {
IAtomContainerSet tmpAtomContainerSet = new AtomContainerSet();
//AtomContainer to save the parsed SMILES in
// AtomContainer to save the parsed SMILES in
IAtomContainer tmpMolecule = new AtomContainer();
SmilesParser tmpSmilesParser = new SmilesParser(SilentChemObjectBuilder.getInstance());
String tmpSmilesFileNextLine = "";
String tmpSmilesFileDeterminedSeparator = "";
String[] tmpProcessedLineArray;
String[] tmpProcessedLineArray = null;
int tmpSmilesCodeExpectedPosition = 0;
int tmpIDExpectedPosition = 0;
int tmpSmilesFileParsableLinesCounter = 0;
int tmpSmilesFileInvalidLinesCounter = 0;
//marking the BufferedReader to reset the reader after checking the format and determining the separator
// marking the BufferedReader to reset the reader after checking the format and determining the separator
tmpSmilesFileBufferedReader.mark(BasicDefinitions.BUFFER_SIZE);
//as potential headline the first line should be avoided for separator determination
// as potential headline the first line should be avoided for separator determination
String tmpSmilesFileFirstLine = tmpSmilesFileBufferedReader.readLine();
/* first block
Checking for parsable SMILES code and saving the determined separator (if one is used).
Expand All @@ -428,7 +431,7 @@ Checking for parsable SMILES code and saving the determined separator (if one is
findSeparatorLoop:
while (!Thread.currentThread().isInterrupted() && tmpFilesLine <= 3) {
if ((tmpSmilesFileNextLine = tmpSmilesFileBufferedReader.readLine()) == null) {
//if the file's end is reached at this point, the first line is used to determine the separator
// if the file's end is reached at this point, the first line is used to determine the separator
if (tmpSmilesFileFirstLine != null || !tmpSmilesFileFirstLine.isEmpty()) {
tmpSmilesFileNextLine = tmpSmilesFileFirstLine;
tmpSmilesFileFirstLine = null;
Expand All @@ -437,7 +440,8 @@ Checking for parsable SMILES code and saving the determined separator (if one is
}
}
for (String tmpSeparator : BasicDefinitions.POSSIBLE_SMILES_FILE_SEPARATORS) {
//maximum of two array elements expected, otherwise the separator or the line itself are assumed to be invalid
// maximum of two array elements expected, otherwise the separator or the line itself are assumed
// to be invalid
tmpProcessedLineArray = tmpSmilesFileNextLine.split(tmpSeparator, 3);
if (tmpProcessedLineArray.length > 2) {
continue;
Expand All @@ -455,9 +459,8 @@ Checking for parsable SMILES code and saving the determined separator (if one is
if (tmpProcessedLineArray.length > 1) {
if (tmpSmilesCodeExpectedPosition == 0) {
tmpIDExpectedPosition = 1;
} else {
tmpIDExpectedPosition = 0;
}
// else: tmpIDExpectedPosition = 0;
}
break findSeparatorLoop;
}
Expand All @@ -475,20 +478,30 @@ Checking for parsable SMILES code and saving the determined separator (if one is
tmpSmilesFileBufferedReader.reset();
tmpSmilesFileBufferedReader.mark(0); //to avoid the memory of unnecessary data
/* second block
Reading the file line by line and adding an AtomContainer to the AtomContainerSet for each line with parsable SMILES code
Reading the file line by line and adding an AtomContainer to the AtomContainerSet for each line with
parsable SMILES code
*/
while (!Thread.currentThread().isInterrupted() && (tmpSmilesFileNextLine = tmpSmilesFileBufferedReader.readLine()) != null) {
while (!Thread.currentThread().isInterrupted()
&& (tmpSmilesFileNextLine = tmpSmilesFileBufferedReader.readLine()) != null) {
//trying to parse as SMILES code
boolean tmpContainsParsableSmilesCode;
try {
tmpProcessedLineArray = tmpSmilesFileNextLine.split(tmpSmilesFileDeterminedSeparator, 2);
if (!tmpProcessedLineArray[tmpSmilesCodeExpectedPosition].isEmpty()) {
tmpMolecule = tmpSmilesParser.parseSmiles(tmpProcessedLineArray[tmpSmilesCodeExpectedPosition]);
tmpContainsParsableSmilesCode = true;
tmpSmilesFileParsableLinesCounter++;
} else {
tmpSmilesFileInvalidLinesCounter++;
continue;
tmpContainsParsableSmilesCode = false;
}
} catch (InvalidSmilesException | IndexOutOfBoundsException anException) { //case: invalid line or SMILES code
} catch (InvalidSmilesException | IndexOutOfBoundsException anException) {
//case: invalid line or SMILES code
tmpContainsParsableSmilesCode = false;
}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Empty catch block?

Copy link
Collaborator Author

@SamuelBehr SamuelBehr Aug 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I see, I probably shouldn't do that. Would you prefer this instead?

            boolean tmpContainsParsableSmilesCode;    //removed the initialization
            try {
                tmpProcessedLineArray = tmpSmilesFileNextLine.split(tmpSmilesFileDeterminedSeparator, 2);
                if (!tmpProcessedLineArray[tmpSmilesCodeExpectedPosition].isEmpty()) {
                    tmpMolecule = tmpSmilesParser.parseSmiles(tmpProcessedLineArray[tmpSmilesCodeExpectedPosition]);
                    tmpContainsParsableSmilesCode = true;
                    tmpSmilesFileParsableLinesCounter++;
                } else {
                    tmpContainsParsableSmilesCode = false;    //new
                }
            } catch (InvalidSmilesException | IndexOutOfBoundsException anException) {
                //case: invalid line or SMILES code
                tmpContainsParsableSmilesCode = false;       //new
            }

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess the confusion comes from two different cases, the expected SMILES position being empty (no exception) and the SMILES code being unparsable (exception), both leading to the same code block below. But yes, I'd say it is safer and easier to understand if the boolean value is set explicitly in both cases.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That causes the confusion.
Anyway, I find it a bit unusual to place the logging outside the catch block. Why not do it like this and get rid of the boolean:

try {
     tmpProcessedLineArray = tmpSmilesFileNextLine.split(tmpSmilesFileDeterminedSeparator, 2);
     String tmpSmiles = tmpProcessedLineArray[tmpSmilesCodeExpectedPosition].isBlank() ? null : 
                                     tmpProcessedLineArray[tmpSmilesCodeExpectedPosition];
     tmpMolecule = tmpSmilesParser.parseSmiles(tmpSmiles);
     tmpSmilesFileParsableLinesCounter++;
 }
catch (InvalidSmilesException | IndexOutOfBoundsException anException) {
     int tmpIndexInFile = tmpSmilesFileParsableLinesCounter + tmpSmilesFileInvalidLinesCounter;
     Importer.LOGGER.info("Contains no parsable SMILES string: line " + tmpIndexInFile + " (index).");
     tmpSmilesFileInvalidLinesCounter++;
     continue;
}

This should cause an InvalidSmilesException if the entry at the expected SMILES position is blank or null.

if (!tmpContainsParsableSmilesCode) {
int tmpIndexInFile = tmpSmilesFileParsableLinesCounter + tmpSmilesFileInvalidLinesCounter;
Importer.LOGGER.info("Contains no parsable SMILES string: line " + tmpIndexInFile
+ " (index).");
tmpSmilesFileInvalidLinesCounter++;
continue;
}
Expand All @@ -497,14 +510,17 @@ Checking for parsable SMILES code and saving the determined separator (if one is
if (tmpProcessedLineArray.length > 1 && !tmpProcessedLineArray[tmpIDExpectedPosition].isEmpty()) {
tmpName = tmpProcessedLineArray[tmpIDExpectedPosition];
} else {
tmpName = FileUtil.getFileNameWithoutExtension(aFile) + tmpSmilesFileParsableLinesCounter;
int tmpIndexInFile = tmpSmilesFileParsableLinesCounter + tmpSmilesFileInvalidLinesCounter - 1;
tmpName = FileUtil.getFileNameWithoutExtension(aFile) + tmpIndexInFile;
}
tmpMolecule.setProperty(Importer.MOLECULE_NAME_PROPERTY_KEY, tmpName);
//adding tmpMolecule to the AtomContainerSet
tmpAtomContainerSet.addAtomContainer(tmpMolecule);
}
Importer.LOGGER.log(Level.INFO, "\tSmilesFile ParsableLinesCounter:\t" + tmpSmilesFileParsableLinesCounter +
"\n\tSmilesFile InvalidLinesCounter:\t\t" + tmpSmilesFileInvalidLinesCounter);
if (tmpSmilesFileInvalidLinesCounter > 0) {
Importer.LOGGER.info("\tSmilesFile ParsableLinesCount:\t" + tmpSmilesFileParsableLinesCounter +
"\n\tSmilesFile InvalidLinesCount:\t" + tmpSmilesFileInvalidLinesCounter);
}
return tmpAtomContainerSet;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public final class BasicDefinitions {
/**
* Possible SMILES file separators used to separate SMILES code from ID
*/
public static final String[] POSSIBLE_SMILES_FILE_SEPARATORS = {"\t", ";", ",", " "};
public static final String[] POSSIBLE_SMILES_FILE_SEPARATORS = {"|", "\t", ";", ",", " "};
//</editor-fold>
//
// <editor-fold defaultstate="collapsed" desc="Buffer">
Expand Down