RDKit
Open-source cheminformatics and machine learning.
Embedder.h
Go to the documentation of this file.
1//
2// Copyright (C) 2004-2017 Greg Landrum and Rational Discovery LLC
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11#include <RDGeneral/export.h>
12#ifndef RD_EMBEDDER_H_GUARD
13#define RD_EMBEDDER_H_GUARD
14
15#include <map>
16#include <utility>
17#include <Geometry/point.h>
18#include <GraphMol/ROMol.h>
19#include <boost/shared_ptr.hpp>
21
22namespace RDKit {
23namespace DGeomHelpers {
24
25//! Parameter object for controlling embedding
26/*!
27 numConfs Number of conformations to be generated
28 numThreads Sets the number of threads to use (more than one thread
29 will only be used if the RDKit was build with multithread
30 support) If set to zero, the max supported by the system will
31 be used.
32 maxIterations Max. number of times the embedding will be tried if
33 coordinates are not obtained successfully. The default
34 value is 10x the number of atoms.
35 randomSeed provides a seed for the random number generator (so that
36 the same coordinates can be obtained for a
37 molecule on multiple runs) If -1, the
38 RNG will not be seeded.
39 clearConfs Clear all existing conformations on the molecule
40 useRandomCoords Start the embedding from random coordinates instead of
41 using eigenvalues of the distance matrix.
42 boxSizeMult Determines the size of the box that is used for
43 random coordinates. If this is a positive number, the
44 side length will equal the largest element of the distance
45 matrix times \c boxSizeMult. If this is a negative number,
46 the side length will equal \c -boxSizeMult (i.e. independent
47 of the elements of the distance matrix).
48 randNegEig Picks coordinates at random when a embedding process produces
49 negative eigenvalues
50 numZeroFail Fail embedding if we find this many or more zero eigenvalues
51 (within a tolerance)
52 pruneRmsThresh Retain only the conformations out of 'numConfs' after
53 embedding that are at least this far apart from each other.
54 RMSD is computed on the heavy atoms.
55 Prunining is greedy; i.e. the first embedded conformation is
56 retained and from then on only those that are at least
57 \c pruneRmsThresh away from already
58 retained conformations are kept. The pruning is done
59 after embedding and bounds violation minimization.
60 No pruning by default.
61 coordMap a map of int to Point3D, between atom IDs and their locations
62 their locations. If this container is provided, the
63 coordinates are used to set distance constraints on the
64 embedding. The resulting conformer(s) should have distances
65 between the specified atoms that reproduce those between the
66 points in \c coordMap. Because the embedding produces a
67 molecule in an arbitrary reference frame, an alignment step
68 is required to actually reproduce the provided coordinates.
69 optimizerForceTol set the tolerance on forces in the DGeom optimizer
70 (this shouldn't normally be altered in client code).
71 ignoreSmoothingFailures try to embed the molecule even if triangle bounds
72 smoothing fails
73 enforceChirality enforce the correct chirality if chiral centers are present
74 useExpTorsionAnglePrefs impose experimental torsion-angle preferences
75 useBasicKnowledge impose "basic knowledge" terms such as flat
76 aromatic rings, ketones, etc.
77 ETversion version of the experimental torsion-angle preferences
78 verbose print output of experimental torsion-angle preferences
79 basinThresh set the basin threshold for the DGeom force field,
80 (this shouldn't normally be altered in client code).
81 onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
82 boundsMat custom bound matrix to specify upper and lower bounds of atom
83 pairs
84 embedFragmentsSeparately embed each fragment of molecule in turn
85 useSmallRingTorsions optional torsions to improve small ring conformer
86 sampling
87 useMacrocycleTorsions optional torsions to improve macrocycle conformer
88 sampling
89 useMacrocycle14config If 1-4 distances bound heuristics for
90 macrocycles is used
91 CPCI custom columbic interactions between atom pairs
92 callback void pointer to a function for reporting progress,
93 will be called with the current iteration number.
94 forceTransAmides constrain amide bonds to be trans.
95 useSymmetryForPruning use molecule symmetry when doing the RMSD pruning.
96 NOTE that for reasons of computational efficiency,
97 setting this will also set onlyHeavyAtomsForRMS to
98 true.
99
100
101*/
103 unsigned int maxIterations{0};
104 int numThreads{1};
105 int randomSeed{-1};
106 bool clearConfs{true};
107 bool useRandomCoords{false};
108 double boxSizeMult{2.0};
109 bool randNegEig{true};
110 unsigned int numZeroFail{1};
111 const std::map<int, RDGeom::Point3D> *coordMap{nullptr};
112 double optimizerForceTol{1e-3};
113 bool ignoreSmoothingFailures{false};
114 bool enforceChirality{true};
115 bool useExpTorsionAnglePrefs{false};
116 bool useBasicKnowledge{false};
117 bool verbose{false};
118 double basinThresh{5.0};
119 double pruneRmsThresh{-1.0};
120 bool onlyHeavyAtomsForRMS{false};
121 unsigned int ETversion{1};
122 boost::shared_ptr<const DistGeom::BoundsMatrix> boundsMat;
123 bool embedFragmentsSeparately{true};
124 bool useSmallRingTorsions{false};
125 bool useMacrocycleTorsions{false};
126 bool useMacrocycle14config{false};
127 std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>> CPCI;
128 void (*callback)(unsigned int);
129 bool forceTransAmides{true};
130 bool useSymmetryForPruning{true};
131 double boundsMatForceScaling{1.0};
132 EmbedParameters() : boundsMat(nullptr), CPCI(nullptr), callback(nullptr) {}
134 unsigned int maxIterations, int numThreads, int randomSeed,
135 bool clearConfs, bool useRandomCoords, double boxSizeMult,
136 bool randNegEig, unsigned int numZeroFail,
137 const std::map<int, RDGeom::Point3D> *coordMap, double optimizerForceTol,
138 bool ignoreSmoothingFailures, bool enforceChirality,
139 bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose,
140 double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS,
141 unsigned int ETversion = 1,
142 const DistGeom::BoundsMatrix *boundsMat = nullptr,
143 bool embedFragmentsSeparately = true, bool useSmallRingTorsions = false,
144 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false,
145 std::shared_ptr<std::map<std::pair<unsigned int, unsigned int>, double>>
146 CPCI = nullptr,
147 void (*callback)(unsigned int) = nullptr)
148 : maxIterations(maxIterations),
149 numThreads(numThreads),
150 randomSeed(randomSeed),
151 clearConfs(clearConfs),
152 useRandomCoords(useRandomCoords),
153 boxSizeMult(boxSizeMult),
154 randNegEig(randNegEig),
155 numZeroFail(numZeroFail),
156 coordMap(coordMap),
157 optimizerForceTol(optimizerForceTol),
158 ignoreSmoothingFailures(ignoreSmoothingFailures),
159 enforceChirality(enforceChirality),
160 useExpTorsionAnglePrefs(useExpTorsionAnglePrefs),
161 useBasicKnowledge(useBasicKnowledge),
162 verbose(verbose),
163 basinThresh(basinThresh),
164 pruneRmsThresh(pruneRmsThresh),
165 onlyHeavyAtomsForRMS(onlyHeavyAtomsForRMS),
166 ETversion(ETversion),
167 boundsMat(boundsMat),
168 embedFragmentsSeparately(embedFragmentsSeparately),
169 useSmallRingTorsions(useSmallRingTorsions),
170 useMacrocycleTorsions(useMacrocycleTorsions),
171 useMacrocycle14config(useMacrocycle14config),
172 CPCI(std::move(CPCI)),
173 callback(callback) {}
174};
175
176//! update parameters from a JSON string
178 EmbedParameters &params, const std::string &json);
179
180//! Embed multiple conformations for a molecule
182 ROMol &mol, INT_VECT &res, unsigned int numConfs,
183 const EmbedParameters &params);
184inline INT_VECT EmbedMultipleConfs(ROMol &mol, unsigned int numConfs,
185 const EmbedParameters &params) {
186 INT_VECT res;
187 EmbedMultipleConfs(mol, res, numConfs, params);
188 return res;
189}
190
191//! Compute an embedding (in 3D) for the specified molecule using Distance
192/// Geometry
193inline int EmbedMolecule(ROMol &mol, const EmbedParameters &params) {
194 INT_VECT confIds;
195 EmbedMultipleConfs(mol, confIds, 1, params);
196
197 int res;
198 if (confIds.size()) {
199 res = confIds[0];
200 } else {
201 res = -1;
202 }
203 return res;
204}
205
206//! Compute an embedding (in 3D) for the specified molecule using Distance
207/// Geometry
208/*!
209 The following operations are performed (in order) here:
210 -# Build a distance bounds matrix based on the topology, including 1-5
211 distances but not VDW scaling
212 -# Triangle smooth this bounds matrix
213 -# If step 2 fails - repeat step 1, this time without 1-5 bounds and with vdW
214 scaling, and repeat step 2
215 -# Pick a distance matrix at random using the bounds matrix
216 -# Compute initial coordinates from the distance matrix
217 -# Repeat steps 3 and 4 until maxIterations is reached or embedding is
218 successful
219 -# Adjust initial coordinates by minimizing a Distance Violation error
220 function
221 **NOTE**: if the molecule has multiple fragments, they will be embedded
222 separately,
223 this means that they will likely occupy the same region of space.
224 \param mol Molecule of interest
225 \param maxIterations Max. number of times the embedding will be tried if
226 coordinates are not obtained successfully. The default
227 value is 10x the number of atoms.
228 \param seed provides a seed for the random number generator (so that
229 the same coordinates can be obtained for a molecule on
230 multiple runs). If negative, the RNG will not be seeded.
231 \param clearConfs Clear all existing conformations on the molecule
232 \param useRandomCoords Start the embedding from random coordinates instead of
233 using eigenvalues of the distance matrix.
234 \param boxSizeMult Determines the size of the box that is used for
235 random coordinates. If this is a positive number, the
236 side length will equal the largest element of the
237 distance matrix times \c boxSizeMult. If this is a
238 negative number, the side length will equal
239 \c -boxSizeMult (i.e. independent of the elements of the
240 distance matrix).
241 \param randNegEig Picks coordinates at random when a embedding process
242 produces negative eigenvalues
243 \param numZeroFail Fail embedding if we find this many or more zero
244 eigenvalues (within a tolerance)
245 \param coordMap a map of int to Point3D, between atom IDs and their locations
246 their locations. If this container is provided, the
247 coordinates are used to set distance constraints on the
248 embedding. The resulting conformer(s) should have distances
249 between the specified atoms that reproduce those between the
250 points in \c coordMap. Because the embedding produces a
251 molecule in an arbitrary reference frame, an alignment step
252 is required to actually reproduce the provided coordinates.
253 \param optimizerForceTol set the tolerance on forces in the distgeom optimizer
254 (this shouldn't normally be altered in client code).
255 \param ignoreSmoothingFailures try to embed the molecule even if triangle
256 bounds smoothing fails
257 \param enforceChirality enforce the correct chirality if chiral centers are
258 present
259 \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
260 \param useBasicKnowledge impose "basic knowledge" terms such as flat
261 aromatic rings, ketones, etc.
262 \param verbose print output of experimental torsion-angle preferences
263 \param basinThresh set the basin threshold for the DGeom force field,
264 (this shouldn't normally be altered in client code).
265 \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
266 \param ETversion version of torsion preferences to use
267 \param useSmallRingTorsions optional torsions to improve small ring
268 conformer sampling
269
270 \param useMacrocycleTorsions optional torsions to improve macrocycle
271 conformer sampling \param useMacrocycle14config If 1-4 distances bound
272 heuristics for macrocycles is used \return ID of the conformations added to
273 the molecule, -1 if the emdedding failed
274*/
275inline int EmbedMolecule(
276 ROMol &mol, unsigned int maxIterations = 0, int seed = -1,
277 bool clearConfs = true, bool useRandomCoords = false,
278 double boxSizeMult = 2.0, bool randNegEig = true,
279 unsigned int numZeroFail = 1,
280 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
281 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
282 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
283 bool useBasicKnowledge = false, bool verbose = false,
284 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
285 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
286 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
287 EmbedParameters params(
288 maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
289 randNegEig, numZeroFail, coordMap, optimizerForceTol,
290 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
291 useBasicKnowledge, verbose, basinThresh, -1.0, onlyHeavyAtomsForRMS,
292 ETversion, nullptr, true, useSmallRingTorsions, useMacrocycleTorsions,
293 useMacrocycle14config);
294 return EmbedMolecule(mol, params);
295};
296
297//*! Embed multiple conformations for a molecule
298/*!
299 This is kind of equivalent to calling EmbedMolecule multiple times - just that
300 the bounds
301 matrix is computed only once from the topology
302 **NOTE**: if the molecule has multiple fragments, they will be embedded
303 separately,
304 this means that they will likely occupy the same region of space.
305 \param mol Molecule of interest
306 \param res Used to return the resulting conformer ids
307 \param numConfs Number of conformations to be generated
308 \param numThreads Sets the number of threads to use (more than one thread
309 will only be used if the RDKit was build with
310 multithread
311 support). If set to zero, the max supported by the
312 system
313 will be used.
314 \param maxIterations Max. number of times the embedding will be tried if
315 coordinates are not obtained successfully. The default
316 value is 10x the number of atoms.
317 \param seed provides a seed for the random number generator (so that
318 the same coordinates can be obtained for a molecule on
319 multiple runs). If negative, the RNG will not be seeded.
320 \param clearConfs Clear all existing conformations on the molecule
321 \param useRandomCoords Start the embedding from random coordinates instead of
322 using eigenvalues of the distance matrix.
323 \param boxSizeMult Determines the size of the box that is used for
324 random coordinates. If this is a positive number, the
325 side length will equal the largest element of the
326 distance matrix times \c boxSizeMult. If this is a
327 negative number, the side length will equal
328 \c -boxSizeMult (i.e. independent of the elements of the
329 distance matrix).
330 \param randNegEig Picks coordinates at random when a embedding process
331 produces negative eigenvalues
332 \param numZeroFail Fail embedding if we find this many or more zero
333 eigenvalues (within a tolerance)
334 \param pruneRmsThresh Retain only the conformations out of 'numConfs' after
335 embedding that are at least this far apart from each
336 other. RMSD is computed on the heavy atoms.
337 Pruning is greedy; i.e. the first embedded conformation
338 is retained and from then on only those that are at
339 least
340 pruneRmsThresh away from already retained conformations
341 are kept. The pruning is done after embedding and
342 bounds violation minimization. No pruning by default.
343 \param coordMap a map of int to Point3D, between atom IDs and their locations
344 their locations. If this container is provided, the
345 coordinates are used to set distance constraints on the
346 embedding. The resulting conformer(s) should have distances
347 between the specified atoms that reproduce those between the
348 points in \c coordMap. Because the embedding produces a
349 molecule in an arbitrary reference frame, an alignment step
350 is required to actually reproduce the provided coordinates.
351 \param optimizerForceTol set the tolerance on forces in the DGeom optimizer
352 (this shouldn't normally be altered in client code).
353 \param ignoreSmoothingFailures try to embed the molecule even if triangle
354 bounds smoothing fails
355 \param enforceChirality enforce the correct chirality if chiral centers are
356 present
357 \param useExpTorsionAnglePrefs impose experimental torsion-angle preferences
358 \param useBasicKnowledge impose "basic knowledge" terms such as flat
359 aromatic rings, ketones, etc.
360 \param verbose print output of experimental torsion-angle preferences
361 \param basinThresh set the basin threshold for the DGeom force field,
362 (this shouldn't normally be altered in client code).
363 \param onlyHeavyAtomsForRMS only use the heavy atoms when doing RMS filtering
364 \param ETversion version of torsion preferences to use
365 \param useSmallRingTorsions optional torsions to improve small ring
366 conformer sampling
367
368 \param useMacrocycleTorsions optional torsions to improve macrocycle
369 conformer sampling \param useMacrocycle14config If 1-4 distances bound
370 heuristics for macrocycles is used
371
372*/
374 ROMol &mol, INT_VECT &res, unsigned int numConfs = 10, int numThreads = 1,
375 unsigned int maxIterations = 30, int seed = -1, bool clearConfs = true,
376 bool useRandomCoords = false, double boxSizeMult = 2.0,
377 bool randNegEig = true, unsigned int numZeroFail = 1,
378 double pruneRmsThresh = -1.0,
379 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
380 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
381 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
382 bool useBasicKnowledge = false, bool verbose = false,
383 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
384 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
385 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
386 EmbedParameters params(
387 maxIterations, numThreads, seed, clearConfs, useRandomCoords, boxSizeMult,
388 randNegEig, numZeroFail, coordMap, optimizerForceTol,
389 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
390 useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
391 onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
392 useMacrocycleTorsions, useMacrocycle14config);
393 EmbedMultipleConfs(mol, res, numConfs, params);
394};
395//! \overload
397 ROMol &mol, unsigned int numConfs = 10, unsigned int maxIterations = 30,
398 int seed = -1, bool clearConfs = true, bool useRandomCoords = false,
399 double boxSizeMult = 2.0, bool randNegEig = true,
400 unsigned int numZeroFail = 1, double pruneRmsThresh = -1.0,
401 const std::map<int, RDGeom::Point3D> *coordMap = nullptr,
402 double optimizerForceTol = 1e-3, bool ignoreSmoothingFailures = false,
403 bool enforceChirality = true, bool useExpTorsionAnglePrefs = false,
404 bool useBasicKnowledge = false, bool verbose = false,
405 double basinThresh = 5.0, bool onlyHeavyAtomsForRMS = false,
406 unsigned int ETversion = 1, bool useSmallRingTorsions = false,
407 bool useMacrocycleTorsions = false, bool useMacrocycle14config = false) {
408 EmbedParameters params(
409 maxIterations, 1, seed, clearConfs, useRandomCoords, boxSizeMult,
410 randNegEig, numZeroFail, coordMap, optimizerForceTol,
411 ignoreSmoothingFailures, enforceChirality, useExpTorsionAnglePrefs,
412 useBasicKnowledge, verbose, basinThresh, pruneRmsThresh,
413 onlyHeavyAtomsForRMS, ETversion, nullptr, true, useSmallRingTorsions,
414 useMacrocycleTorsions, useMacrocycle14config);
415 INT_VECT res;
416 EmbedMultipleConfs(mol, res, numConfs, params);
417 return res;
418};
419
420//! Parameters corresponding to Sereina Riniker's KDG approach
421RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters KDG;
422//! Parameters corresponding to Sereina Riniker's ETDG approach
423RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETDG;
424//! Parameters corresponding to Sereina Riniker's ETKDG approach
425RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDG;
426//! Parameters corresponding to Sereina Riniker's ETKDG approach - version 2
427RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv2;
428//! Parameters corresponding improved ETKDG by Wang, Witek, Landrum and Riniker
429//! (10.1021/acs.jcim.0c00025) - the macrocycle part
430RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters ETKDGv3;
431//! Parameters corresponding improved ETKDG by Wang, Witek, Landrum and Riniker
432//! (10.1021/acs.jcim.0c00025) - the small ring part
433RDKIT_DISTGEOMHELPERS_EXPORT extern const EmbedParameters srETKDGv3;
434} // namespace DGeomHelpers
435} // namespace RDKit
436
437#endif
Defines the primary molecule class ROMol as well as associated typedefs.
Class to store the distance bound.
Definition: BoundsMatrix.h:28
#define RDKIT_DISTGEOMHELPERS_EXPORT
Definition: export.h:121
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv2
Parameters corresponding to Sereina Riniker's ETKDG approach - version 2.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETDG
Parameters corresponding to Sereina Riniker's ETDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT void updateEmbedParametersFromJSON(EmbedParameters &params, const std::string &json)
update parameters from a JSON string
int EmbedMolecule(ROMol &mol, const EmbedParameters &params)
Definition: Embedder.h:193
RDKIT_DISTGEOMHELPERS_EXPORT void EmbedMultipleConfs(ROMol &mol, INT_VECT &res, unsigned int numConfs, const EmbedParameters &params)
Embed multiple conformations for a molecule.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters ETKDG
Parameters corresponding to Sereina Riniker's ETKDG approach.
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters srETKDGv3
RDKIT_DISTGEOMHELPERS_EXPORT const EmbedParameters KDG
Parameters corresponding to Sereina Riniker's KDG approach.
const uint32_t seed
Definition: MHFP.h:29
Std stuff.
Definition: Abbreviations.h:19
std::vector< int > INT_VECT
Definition: types.h:278
Parameter object for controlling embedding.
Definition: Embedder.h:102
EmbedParameters(unsigned int maxIterations, int numThreads, int randomSeed, bool clearConfs, bool useRandomCoords, double boxSizeMult, bool randNegEig, unsigned int numZeroFail, const std::map< int, RDGeom::Point3D > *coordMap, double optimizerForceTol, bool ignoreSmoothingFailures, bool enforceChirality, bool useExpTorsionAnglePrefs, bool useBasicKnowledge, bool verbose, double basinThresh, double pruneRmsThresh, bool onlyHeavyAtomsForRMS, unsigned int ETversion=1, const DistGeom::BoundsMatrix *boundsMat=nullptr, bool embedFragmentsSeparately=true, bool useSmallRingTorsions=false, bool useMacrocycleTorsions=false, bool useMacrocycle14config=false, std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI=nullptr, void(*callback)(unsigned int)=nullptr)
Definition: Embedder.h:133
boost::shared_ptr< const DistGeom::BoundsMatrix > boundsMat
Definition: Embedder.h:122
std::shared_ptr< std::map< std::pair< unsigned int, unsigned int >, double > > CPCI
Definition: Embedder.h:127