Skip to content

Commit

Permalink
Merge pull request #13 from ncats/index_large_mols
Browse files Browse the repository at this point in the history
Index large mols
  • Loading branch information
blueSwordfish authored Jun 4, 2024
2 parents 118149b + c77e726 commit 51fbd67
Show file tree
Hide file tree
Showing 4 changed files with 2,875 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<groupId>gov.nih.ncats</groupId>
<artifactId>structure-indexer</artifactId>
<packaging>jar</packaging>
<version>0.0.15</version>
<version>0.0.16</version>
<name>structure-indexer</name>
<url>https://github.com/ncats/structure-indexer</url>
<description>This is a self-contained structure indexer that uses Lucene as the underlying storage and indexing engine.</description>
Expand Down
21 changes: 14 additions & 7 deletions src/main/java/gov/nih/ncats/structureIndexer/StructureIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ public class StructureIndexer {
/** Code size in bits; 8 bits gives 256 distinct values. */
static final int CODESIZE = 8; // 8-bit or 256
/**
 * NOTE(review): presumably the number of codebooks iterated during
 * indexing; the allocation is outside this excerpt, so confirm.
 */
static final int CODEBOOKS = 256;

/**
 * Atom-count threshold for writing the indexed structure as a molfile.
 * When a chemical's atom count is greater than this value, the indexer
 * stores a SMILES string instead of a molfile.
 * NOTE(review): presumably 999 reflects the three-digit atom-count field of
 * the V2000 molfile counts line; confirm against the CTfile specification.
 */
static final int MAX_ATOMS_V2000 = 999;

/**
 * Fixed table of seven upper-case letters.
 * NOTE(review): its use is not visible in this excerpt; confirm at the
 * call sites before changing its order or contents.
 */
static final char[] ALPHA = {
'Q','X','Y','Z','U','V','W'
};
Expand Down Expand Up @@ -410,11 +412,11 @@ public Fingerprint getFpSim () {
public Chemical getMol () {
if (mol == null) {
String mol = doc.get(FIELD_MOLFILE);

logger.finest(String.format("in getMol, beginning of mol %s", (mol != null && mol.length() > 100 ?mol.substring(0, 99) : "blank/short")));

try {

this.mol = Chemical.parseMol(mol);
this.mol = Chemical.parse(mol);
// try{
// this.mol.aromatize();
// }catch(Exception e){
Expand All @@ -428,10 +430,12 @@ public Chemical getMol () {
}
catch (Exception ex) {
ex.printStackTrace();
System.err.println("bbadmol=\n"+mol);
throw new RuntimeException
String id = doc.get(FIELD_ID) != null ? doc.get(FIELD_ID) : "(unknown)";
System.err.printf("bbadmol (id=%s=\n%s\n", id, mol);
/*throw new RuntimeException
("Document "+doc.get(FIELD_ID)+" contains bogus "
+"field "+FIELD_MOLFILE+"!\n" , ex);
+"field "+FIELD_MOLFILE+"!\n" , ex);*/
return new Chemical();
}
}
return mol;
Expand Down Expand Up @@ -1198,9 +1202,12 @@ protected void instrument (Document doc, Chemical orig)
byte[] fpSim = fingerprintSim.toByteArray();

chemical.makeHydrogensExplicit();
String indexMolHExp = chemical.toMol(new ChemFormat.MolFormatSpecification()
// if the atom count exceeds MAX_ATOMS_V2000 (999), index a SMILES string instead of a molfile
String indexMolHExp = chemical.getAtomCount() > MAX_ATOMS_V2000 ?
chemical.toSmiles(new ChemFormat.SmilesFormatWriterSpecification().setKekulization(ChemFormat.KekulizationEncoding.FORCE_AROMATIC)) :
chemical.toMol(new ChemFormat.MolFormatSpecification()
.setKekulization(ChemFormat.KekulizationEncoding.FORCE_AROMATIC));
logger.finest(String.format("got indexMolHExp %s", indexMolHExp));

for (int i = 0; i < codebooks.length; ++i) {
Codebook cb = codebooks[i];
Expand Down
Loading

0 comments on commit 51fbd67

Please sign in to comment.