forked from rdkit/rdkit
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* First pass at splitting molecule. * Interim commit. Reading libraries from file in original format. * Basic search seems to be working. * Pattern fingerprint screening. * Connector region heuristic. * Fixed triazole (aromatic/non-aromatic connectors). * Fix search with non-split parent query, where query is substructure of a single reagent. * Remove duplicate hits by reaction/reagents used. * Implement largest fragment heuristic. * Extra test files. * Read/write binary file. Program for conversion from text format to binary format. * Remove empty reagent sets on reading, probably due to synthon number counting from 1 rather than 0. * Tidy SSSearch functions. * Stash pending major surgery for triazole bug. * Revert to using unique_ptr. Correct use of reagent order. * Function to summarise Hyperspace. * Delay building hits till end and put cutoff on number. * Earlier bale-out in getHitReagents. * Streamline checkConnectorRegions. * Remove free functions for search. * Correct name of Python test. * First stage of Python wrappers. * Rename namespace. * Parameters object. * Mysterious windows export thing. * Fix bug - not matching number of connectors in fragment and synthon. * Back like it was. The connector count wasn't the problem. * Put the substructure results into their own class. * gcc 14 didn't like my use of std::reduce. Update expected test results. * Remove write statement. * Tidy. * Tidy. * Enable random sample of hits. * Test that complex SMARTS works. Update Python wrappers. * Rename Hyperspace to SynthonSpace. * More renaming. Python test. * Enable Python test. Remove write. * Plug memory leak. * Response to Greg's initial look. * More response to Greg's initial look. * get the windows DLL builds working * Do away with mutable. Purge a few more uses of reagent in favour of synthon. Remove the c++ exe for converting text to binary databases. * Better Synthon c'tor. * More feedback from Greg. * Tidy the Python wrapper. * Remove tags from catch tests. * Don't allow copying of SubstructureResults. * Revert to allow copying of SubstructureResults. The Python wrapper needs it. * Refinements based on CLion/clangd suggestions. * Allow for map numbers in connectors in space file. * Response to review. * update binary file spec * Changes after review. --------- Co-authored-by: David Cosgrove <[email protected]> Co-authored-by: Greg Landrum <[email protected]>
- Loading branch information
1 parent
ff93b29
commit eaf544a
Showing
29 changed files
with
37,865 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
|
||
rdkit_library(SynthonSpaceSearch | ||
SynthonSpaceSearch_details.cpp SynthonSpace.cpp SynthonSet.cpp Synthon.cpp | ||
LINK_LIBRARIES SmilesParse FileParsers ChemTransforms Fingerprints SubstructMatch GraphMol) | ||
target_compile_definitions(SynthonSpaceSearch PRIVATE RDKIT_SYNTHONSPACESEARCH_BUILD) | ||
|
||
rdkit_headers(SynthonSpace.h SynthonSet.h Synthon.h SubstructureResults.h | ||
DEST GraphMol/SynthonSpaceSearch) | ||
|
||
rdkit_catch_test(testSynthonSpaceSearch synthon_space_search_catch.cpp | ||
LINK_LIBRARIES SubstructMatch SubstructLibrary SynthonSpaceSearch) | ||
|
||
if (RDK_BUILD_PYTHON_WRAPPERS) | ||
add_subdirectory(Wrap) | ||
endif () |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// | ||
// Copyright (C) David Cosgrove 2024. | ||
// | ||
// @@ All Rights Reserved @@ | ||
// This file is part of the RDKit. | ||
// The contents are covered by the terms of the BSD license | ||
// which is included in the file license.txt, found at the root | ||
// of the RDKit source tree. | ||
// | ||
|
||
#ifndef RDKIT_SYNTHONSPACE_SUBSTRUCTURERESULTS_H | ||
#define RDKIT_SYNTHONSPACE_SUBSTRUCTURERESULTS_H | ||
|
||
#include <GraphMol/ROMol.h> | ||
|
||
namespace RDKit::SynthonSpaceSearch { | ||
class RDKIT_SYNTHONSPACESEARCH_EXPORT SubstructureResults { | ||
public: | ||
explicit SubstructureResults() : d_maxNumResults(0) {} | ||
SubstructureResults(std::vector<std::unique_ptr<ROMol>> &&mols, | ||
size_t maxNumRes); | ||
SubstructureResults(const SubstructureResults &other); | ||
SubstructureResults(SubstructureResults &&other) = default; | ||
~SubstructureResults() = default; | ||
|
||
SubstructureResults &operator=(const SubstructureResults &other); | ||
SubstructureResults &operator=(SubstructureResults &&other) = default; | ||
|
||
/*! | ||
* Returns the upper bound on the number of results the search might return. | ||
* There may be fewer than this in practice for several reasons such as | ||
* duplicate reagent sets being removed or the final product not matching the | ||
* query even though the synthons suggested it would. | ||
* | ||
* @return int | ||
*/ | ||
size_t getMaxNumResults() const { return d_maxNumResults; } | ||
/*! | ||
* Returns the hits from the search. Not necessarily all those possible, | ||
* just the maximum number requested. | ||
* | ||
* @return std::vector<std::unique_ptr<ROMol>> | ||
*/ | ||
const std::vector<std::unique_ptr<ROMol>> &getHitMolecules() const { | ||
return d_hitMolecules; | ||
} | ||
|
||
private: | ||
std::vector<std::unique_ptr<ROMol>> d_hitMolecules; | ||
size_t d_maxNumResults; | ||
}; | ||
|
||
inline SubstructureResults::SubstructureResults( | ||
std::vector<std::unique_ptr<ROMol>> &&mols, size_t maxNumRes) | ||
: d_maxNumResults(maxNumRes) { | ||
d_hitMolecules = std::move(mols); | ||
mols.clear(); | ||
} | ||
|
||
inline SubstructureResults::SubstructureResults( | ||
const RDKit::SynthonSpaceSearch::SubstructureResults &other) | ||
: d_maxNumResults(other.d_maxNumResults) { | ||
for (const auto &hm : other.d_hitMolecules) { | ||
d_hitMolecules.emplace_back(new ROMol(*hm)); | ||
} | ||
} | ||
} // namespace RDKit::SynthonSpaceSearch | ||
|
||
#endif // RDKIT_SUBSTRUCTURERESULTS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
// | ||
// Copyright (C) David Cosgrove 2024. | ||
// | ||
// @@ All Rights Reserved @@ | ||
// This file is part of the RDKit. | ||
// The contents are covered by the terms of the BSD license | ||
// which is included in the file license.txt, found at the root | ||
// of the RDKit source tree. | ||
// | ||
|
||
#include <DataStructs/ExplicitBitVect.h> | ||
#include <GraphMol/MolOps.h> | ||
#include <GraphMol/MolPickler.h> | ||
#include <GraphMol/Fingerprints/Fingerprints.h> | ||
#include <GraphMol/SynthonSpaceSearch/Synthon.h> | ||
#include <GraphMol/SmilesParse/SmilesParse.h> | ||
|
||
namespace RDKit::SynthonSpaceSearch { | ||
|
||
Synthon::Synthon(const std::string &smi, const std::string &id) | ||
: d_smiles(smi), d_id(id) { | ||
dp_mol = v2::SmilesParse::MolFromSmiles(d_smiles); | ||
if (!dp_mol) { | ||
// This should be rare, as it should be possible to assume that | ||
// the people who made the SynthonSpace know what they're doing. | ||
// Therefore, it's probably a corrupted or incorrect file, so | ||
// bring it all down. | ||
throw ValueErrorException("Unparsable synthon SMILES " + d_smiles + | ||
" with ID " + d_id); | ||
} | ||
dp_mol->setProp<std::string>(common_properties::_Name, d_id); | ||
|
||
dp_pattFP.reset(PatternFingerprintMol(*getMol(), 2048)); | ||
|
||
if (auto cr = getConnRegion(*getMol()); cr) { | ||
std::vector<std::unique_ptr<ROMol>> tmpFrags; | ||
MolOps::getMolFrags(*cr, tmpFrags, false); | ||
for (auto &f : tmpFrags) { | ||
d_connRegions.push_back(std::shared_ptr<ROMol>(f.release())); | ||
} | ||
} | ||
} | ||
|
||
Synthon::Synthon(const Synthon &other) | ||
: d_smiles(other.d_smiles), | ||
d_id(other.d_id), | ||
dp_mol(std::make_unique<ROMol>(*other.dp_mol)), | ||
dp_pattFP(std::make_unique<ExplicitBitVect>(*other.dp_pattFP)), | ||
d_connRegions(other.d_connRegions) {} | ||
|
||
Synthon &Synthon::operator=(const Synthon &other) { | ||
if (this == &other) { | ||
return *this; | ||
} | ||
d_smiles = other.d_smiles; | ||
d_id = other.d_id; | ||
if (other.dp_mol) { | ||
dp_mol = std::make_unique<ROMol>(*other.dp_mol); | ||
} else { | ||
dp_mol.reset(); | ||
} | ||
if (other.dp_pattFP) { | ||
dp_pattFP = std::make_unique<ExplicitBitVect>(*other.dp_pattFP); | ||
} else { | ||
dp_pattFP.reset(); | ||
} | ||
if (!other.d_connRegions.empty()) { | ||
d_connRegions.clear(); | ||
std::transform( | ||
other.d_connRegions.begin(), other.d_connRegions.end(), | ||
std::back_inserter(d_connRegions), | ||
[](const std::shared_ptr<ROMol> &m) -> std::shared_ptr<ROMol> { | ||
return std::make_shared<ROMol>(*m); | ||
}); | ||
} else { | ||
d_connRegions.clear(); | ||
} | ||
return *this; | ||
} | ||
|
||
const std::unique_ptr<ROMol> &Synthon::getMol() const { return dp_mol; } | ||
|
||
const std::unique_ptr<ExplicitBitVect> &Synthon::getPattFP() const { | ||
return dp_pattFP; | ||
} | ||
|
||
const std::vector<std::shared_ptr<ROMol>> &Synthon::getConnRegions() const { | ||
return d_connRegions; | ||
} | ||
|
||
void Synthon::writeToDBStream(std::ostream &os) const { | ||
streamWrite(os, d_smiles); | ||
streamWrite(os, d_id); | ||
MolPickler::pickleMol(*getMol(), os, PicklerOps::AllProps); | ||
auto pattFPstr = getPattFP()->toString(); | ||
streamWrite(os, pattFPstr); | ||
streamWrite(os, getConnRegions().size()); | ||
for (const auto &cr : getConnRegions()) { | ||
MolPickler::pickleMol(*cr, os, PicklerOps::AllProps); | ||
} | ||
} | ||
|
||
void Synthon::readFromDBStream(std::istream &is) { | ||
streamRead(is, d_smiles, 0); | ||
streamRead(is, d_id, 0); | ||
dp_mol = std::make_unique<ROMol>(); | ||
MolPickler::molFromPickle(is, *dp_mol); | ||
std::string pickle; | ||
streamRead(is, pickle, 0); | ||
dp_pattFP = std::make_unique<ExplicitBitVect>(pickle); | ||
size_t numConnRegs; | ||
streamRead(is, numConnRegs); | ||
d_connRegions.resize(numConnRegs); | ||
for (size_t i = 0; i < numConnRegs; ++i) { | ||
d_connRegions[i] = std::make_shared<ROMol>(); | ||
MolPickler::molFromPickle(is, *d_connRegions[i]); | ||
} | ||
} | ||
|
||
std::unique_ptr<ROMol> getConnRegion(const ROMol &mol) { | ||
boost::dynamic_bitset<> inFrag(mol.getNumAtoms()); | ||
for (const auto a : mol.atoms()) { | ||
if (!a->getAtomicNum() && a->getIsotope()) { | ||
inFrag[a->getIdx()] = true; | ||
for (const auto &n1 : mol.atomNeighbors(a)) { | ||
if (!inFrag[n1->getIdx()]) { | ||
inFrag[n1->getIdx()] = true; | ||
for (const auto &n2 : mol.atomNeighbors(n1)) { | ||
if (!inFrag[n2->getIdx()]) { | ||
inFrag[n2->getIdx()] = true; | ||
for (const auto &n3 : mol.atomNeighbors(n2)) { | ||
inFrag[n3->getIdx()] = true; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
if (!inFrag.count()) { | ||
return std::unique_ptr<RWMol>(); | ||
} | ||
|
||
std::unique_ptr<RWMol> molCp(new RWMol(mol)); | ||
molCp->beginBatchEdit(); | ||
for (auto &aCp : molCp->atoms()) { | ||
if (!inFrag[aCp->getIdx()]) { | ||
molCp->removeAtom(aCp); | ||
} else { | ||
if (!aCp->getAtomicNum()) { | ||
aCp->setIsotope(1); | ||
if (aCp->hasQuery()) { | ||
aCp->expandQuery(makeAtomIsotopeQuery(1), Queries::COMPOSITE_OR); | ||
} | ||
} | ||
} | ||
} | ||
molCp->commitBatchEdit(); | ||
return molCp; | ||
} | ||
|
||
} // namespace RDKit::SynthonSpaceSearch |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
// | ||
// Copyright (C) David Cosgrove 2024. | ||
// | ||
// @@ All Rights Reserved @@ | ||
// This file is part of the RDKit. | ||
// The contents are covered by the terms of the BSD license | ||
// which is included in the file license.txt, found at the root | ||
// of the RDKit source tree. | ||
// | ||
|
||
#ifndef RDKIT_REAGENT_H | ||
#define RDKIT_REAGENT_H | ||
|
||
#include <string> | ||
|
||
#include <DataStructs/ExplicitBitVect.h> | ||
#include <GraphMol/ROMol.h> | ||
|
||
namespace RDKit { | ||
class Atom; | ||
|
||
namespace SynthonSpaceSearch { | ||
|
||
// This class holds a Synthon that will be part of a SynthonSet. | ||
class RDKIT_SYNTHONSPACESEARCH_EXPORT Synthon { | ||
public: | ||
Synthon() = default; | ||
Synthon(const std::string &smi, const std::string &id); | ||
Synthon(const Synthon &other); | ||
Synthon(Synthon &&other) = default; | ||
Synthon &operator=(const Synthon &other); | ||
Synthon &operator=(Synthon &&other) = default; | ||
|
||
[[nodiscard]] const std::string &getSmiles() const { return d_smiles; } | ||
[[nodiscard]] const std::string &getId() const { return d_id; } | ||
[[nodiscard]] const std::unique_ptr<ROMol> &getMol() const; | ||
[[nodiscard]] const std::unique_ptr<ExplicitBitVect> &getPattFP() const; | ||
[[nodiscard]] const std::vector<std::shared_ptr<ROMol>> &getConnRegions() | ||
const; | ||
|
||
// Writes to/reads from a binary stream. | ||
void writeToDBStream(std::ostream &os) const; | ||
void readFromDBStream(std::istream &is); | ||
|
||
private: | ||
std::string d_smiles; | ||
std::string d_id; | ||
|
||
std::unique_ptr<ROMol> dp_mol{nullptr}; | ||
std::unique_ptr<ExplicitBitVect> dp_pattFP{nullptr}; | ||
// SMILES strings of any connector regions. Normally there will only | ||
// be 1 or 2. | ||
std::vector<std::shared_ptr<ROMol>> d_connRegions; | ||
}; | ||
|
||
// Return a molecule containing the portions of the molecule starting at | ||
// each dummy atom and going out up to 3 bonds. There may be more than | ||
// 1 fragment if there are dummy atoms more than 3 bonds apart, and there | ||
// may be fragments with more than 1 dummy atom if their fragments fall | ||
// within 3 bonds of each other. E.g. the molecule [1*]CN(C[2*])Cc1ccccc1 | ||
// will give [1*]CN(C)C[1*]. The 2 dummy atoms are 4 bonds apart, but the | ||
// fragments overlap. All dummy atoms given isotope 1 whatever they had before. | ||
RDKIT_SYNTHONSPACESEARCH_EXPORT std::unique_ptr<ROMol> getConnRegion( | ||
const ROMol &mol); | ||
|
||
} // namespace SynthonSpaceSearch | ||
} // namespace RDKit | ||
|
||
#endif // RDKIT_REAGENT_H |
Oops, something went wrong.