Skip to content

Calculators

get_calculator(name, **params)

Get molecular calculator based on name

Parameters:

Name Type Description Default
name str

Name of the featurizer

required
params dict

Parameters of the featurizer

{}

Raises:

Type Description
ValueError

When featurizer is not supported

Returns:

Name Type Description
featurizer

Callable

Source code in molfeat/calc/__init__.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def get_calculator(name: str, **params):
    """Get molecular calculator based on name

    Args:
        name: Name of the featurizer
        params (dict): Parameters of the featurizer

    Raises:
        ValueError: When featurizer is not supported

    Returns:
        featurizer: Callable
    """
    if not isinstance(name, str):
        return name

    CALC_MAP = {k.lower(): v for k, v in _CALCULATORS.items()}
    name = name.lower()
    if name in FP_FUNCS.keys():
        featurizer = FPCalculator(name, **params)
    elif name == "desc3d":
        featurizer = RDKitDescriptors3D(**params)
    elif name == "desc2d":
        featurizer = RDKitDescriptors2D(**params)
    elif name == "mordred":
        featurizer = MordredDescriptors(**params)
    elif name == "cats":
        featurizer = CATS(**params)
    elif name == "cats2d":
        params["use_3d_distances"] = False
        featurizer = CATS(**params)
    elif name == "cats3d":
        params["use_3d_distances"] = True
        featurizer = CATS(**params)
    elif name == "pharm2d":
        featurizer = Pharmacophore2D(**params)
    elif name == "pharm3d":
        featurizer = Pharmacophore3D(**params)
    elif name.startswith("usr"):
        params["method"] = name
        featurizer = USRDescriptors(**params)
    elif name == "electroshape":
        featurizer = ElectroShapeDescriptors(**params)
    elif name in ["scaffoldkeys", "skeys", "scaffkeys"]:
        featurizer = ScaffoldKeyCalculator(**params)
    elif name == "none":
        featurizer = None
    # for any generic calculator that has been automatically registered
    elif name in CALC_MAP.keys():
        featurizer = CALC_MAP[name](**params)
    else:
        raise ValueError(f"{name} is not a supported internal featurizer")
    return featurizer

Serializable Calculator are the base abstract class for implementing your calculators.

SerializableCalculator

Bases: ABC

Interface to define a serializable calculator

Subclassing SerializableCalculator

When subclassing a calculator, you must implement the call method. If your calculator also implements a batch_compute method, it will be used by MoleculeTransformer to accelerate featurization.

from molfeat.calc import SerializableCalculator

class MyCalculator(SerializableCalculator):

    def __call__(self, mol, **kwargs):
        # you have to implement this
        ...

    def __len__(self):
        # you don't have to implement this but are encouraged to do so
        # this is used to determine the length of the output
        ...

    @property
    def columns(self):
        # you don't have to implement this
        # use this to return the name of each entry returned by your featurizer
        ...

    def batch_compute(self, mols:list, **dm_parallelized_kwargs):
        # you don't need to implement this
        # but you should if there is an efficient batching process
        # By default dm.parallelized arguments will also be passed as input
        ...
Source code in molfeat/calc/base.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
class SerializableCalculator(abc.ABC, metaclass=_CalculatorMeta):
    """Interface to define a serializable calculator

    ???+ tip "Subclassing SerializableCalculator"
        When subclassing a calculator, you must implement the __call__ method.
        If your calculator also implements a `batch_compute` method, it will be used
        by `MoleculeTransformer` to accelerate featurization.

        ```python
        from molfeat.calc import SerializableCalculator

        class MyCalculator(SerializableCalculator):

            def __call__(self, mol, **kwargs):
                # you have to implement this
                ...

            def __len__(self):
                # you don't have to implement this but are encouraged to do so
                # this is used to determine the length of the output
                ...

            @property
            def columns(self):
                # you don't have to implement this
                # use this to return the name of each entry returned by your featurizer
                ...

            def batch_compute(self, mols:list, **dm_parallelized_kwargs):
                # you don't need to implement this
                # but you should if there is an efficient batching process
                # By default dm.parallelized arguments will also be passed as input
                ...
        ```
    """

    @abc.abstractmethod
    def __call__(self, *args, **kwargs):
        pass

    @classmethod
    def from_state_dict(cls, state: dict, override_args: Optional[dict] = None):
        """Load from state dictionary

        Args:
            state: dictionary to use to create the the calculator
            overrride_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        cls_name = state.get("name", cls.__name__)
        module_name = state.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)
        kwargs = state["args"].copy()
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def to_state_dict(self):
        """Get the state dictionary"""
        state_dict = {}
        state_dict["name"] = self.__class__.__name__
        state_dict["module"] = self.__class__.__module__
        state_dict["args"] = self.__getstate__()
        state_dict["_molfeat_version"] = MOLFEAT_VERSION
        # we would like to keep input arguments as is.
        signature = inspect.signature(self.__init__)
        val = {k: v.default for k, v in signature.parameters.items()}
        to_remove = [k for k in state_dict["args"] if k not in val.keys()]
        for k in to_remove:
            state_dict["args"].pop(k)
        return state_dict

    def to_state_json(self) -> str:
        """Output this instance as a JSON representation"""
        return json.dumps(self.to_state_dict())

    def to_state_yaml(self) -> str:
        """Output this instance as a YAML representation"""
        return yaml.dump(self.to_state_dict(), Dumper=yaml.SafeDumper)

    def to_state_json_file(self, filepath: str):
        """Save the state of this instance as a JSON file"""
        with fsspec.open(filepath, "w") as f:
            f.write(self.to_state_json())  # type: ignore

    def to_state_yaml_file(self, filepath: str):
        """Save the state of this instance as a YAML file"""
        with fsspec.open(filepath, "w") as f:
            f.write(self.to_state_yaml())  # type: ignore

    @classmethod
    def from_state_json(
        cls,
        state_json: str,
        override_args: Optional[dict] = None,
    ):
        state_dict = yaml.safe_load(state_json)
        return cls.from_state_dict(state_dict, override_args=override_args)

    @classmethod
    def from_state_yaml(
        cls,
        state_yaml: str,
        override_args: Optional[dict] = None,
    ):
        state_dict = yaml.load(state_yaml, Loader=yaml.SafeLoader)
        return cls.from_state_dict(state_dict, override_args=override_args)

    @classmethod
    def from_state_json_file(
        cls,
        filepath: str,
        override_args: Optional[dict] = None,
    ):
        with fsspec.open(filepath, "r") as f:
            featurizer = cls.from_state_json(f.read(), override_args=override_args)  # type: ignore
        return featurizer

    @classmethod
    def from_state_yaml_file(
        cls,
        filepath: str,
        override_args: Optional[dict] = None,
    ):
        with fsspec.open(filepath, "r") as f:
            featurizer = cls.from_state_yaml(f.read(), override_args=override_args)  # type: ignore
        return featurizer

    @classmethod
    def from_state_file(
        cls,
        state_path: str,
        override_args: Optional[dict] = None,
    ):
        if state_path.endswith("yaml") or state_path.endswith("yml"):
            return cls.from_state_yaml_file(filepath=state_path, override_args=override_args)
        elif state_path.endswith("json"):
            return cls.from_state_json_file(filepath=state_path, override_args=override_args)
        elif state_path.endswith("pkl"):
            with fsspec.open(state_path, "rb") as IN:
                return joblib.load(IN)
        raise ValueError(
            "Only files with 'yaml' or 'json' format are allowed. "
            "The filename must be ending with `yaml`, 'yml' or 'json'."
        )

from_state_dict(state, override_args=None) classmethod

Load from state dictionary

Parameters:

Name Type Description Default
state dict

dictionary to use to create the the calculator

required
overrride_args

optional dictionary of arguments to override the ones in the state dict at construction of the new object

required
Source code in molfeat/calc/base.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
@classmethod
def from_state_dict(cls, state: dict, override_args: Optional[dict] = None):
    """Load from state dictionary

    Args:
        state: dictionary to use to create the the calculator
        overrride_args: optional dictionary of arguments to override the ones in the state dict
            at construction of the new object
    """
    cls_name = state.get("name", cls.__name__)
    module_name = state.get("module", cls.__module__)
    module = importlib.import_module(module_name)
    klass = getattr(module, cls_name)
    kwargs = state["args"].copy()
    kwargs.update(**(override_args or {}))
    return klass(**kwargs)

to_state_dict()

Get the state dictionary

Source code in molfeat/calc/base.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def to_state_dict(self):
    """Get the state dictionary"""
    state_dict = {}
    state_dict["name"] = self.__class__.__name__
    state_dict["module"] = self.__class__.__module__
    state_dict["args"] = self.__getstate__()
    state_dict["_molfeat_version"] = MOLFEAT_VERSION
    # we would like to keep input arguments as is.
    signature = inspect.signature(self.__init__)
    val = {k: v.default for k, v in signature.parameters.items()}
    to_remove = [k for k in state_dict["args"] if k not in val.keys()]
    for k in to_remove:
        state_dict["args"].pop(k)
    return state_dict

to_state_json()

Output this instance as a JSON representation

Source code in molfeat/calc/base.py
103
104
105
def to_state_json(self) -> str:
    """Output this instance as a JSON representation"""
    return json.dumps(self.to_state_dict())

to_state_json_file(filepath)

Save the state of this instance as a JSON file

Source code in molfeat/calc/base.py
111
112
113
114
def to_state_json_file(self, filepath: str):
    """Save the state of this instance as a JSON file"""
    with fsspec.open(filepath, "w") as f:
        f.write(self.to_state_json())  # type: ignore

to_state_yaml()

Output this instance as a YAML representation

Source code in molfeat/calc/base.py
107
108
109
def to_state_yaml(self) -> str:
    """Output this instance as a YAML representation"""
    return yaml.dump(self.to_state_dict(), Dumper=yaml.SafeDumper)

to_state_yaml_file(filepath)

Save the state of this instance as a YAML file

Source code in molfeat/calc/base.py
116
117
118
119
def to_state_yaml_file(self, filepath: str):
    """Save the state of this instance as a YAML file"""
    with fsspec.open(filepath, "w") as f:
        f.write(self.to_state_yaml())  # type: ignore

Fingerprints

FPCalculator

Bases: SerializableCalculator

Fingerprint bit calculator for a molecule

Source code in molfeat/calc/fingerprints.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
class FPCalculator(SerializableCalculator):
    """Fingerprint bit calculator for a molecule"""

    def __init__(
        self,
        method: str,
        length: Optional[int] = None,
        counting: bool = False,
        **method_params,
    ):
        """Compute the given fingeprint for a molecule

        !!! note
            For efficiency reason, count fingerprints are hashed and potentially
            re-folded and the count corresponds to the number of bits set to true

        Args:
            method (str): Name of the fingerprint method to use. See FPCalculator.available_fingerprints() for a list
            length (int, optional): Length of the fingerprint. Defaults to None.
                The default corresponds to the fingerpint default.
            counting (bool, optional): Whether to use the count version of the fingerprint
            method_params (dict): any parameters to the fingerprint algorithm.
                See FPCalculator.default_parameters(method) for all the parameters required by a given method.
        """
        self.method = method.lower()
        self.counting = counting or "-count" in self.method
        if self.counting and "-count" not in self.method:
            self.method = self.method + "-count"
        self.input_length = length
        if self.method not in FP_FUNCS:
            raise ValueError(f"Method {self.method} is not a supported featurizer")
        default_params = copy.deepcopy(FP_DEF_PARAMS[method])
        unknown_params = set(method_params.keys()).difference(set(default_params.keys()))
        if unknown_params:
            logger.error(f"Params: {unknown_params} are not valid for {method}")
        self.params = default_params
        self.params.update(
            {k: method_params[k] for k in method_params if k in default_params.keys()}
        )
        self._length = self._set_length(length)

    @staticmethod
    def available_fingerprints():
        """Get the list of available fingerprints"""
        return list(FP_FUNCS.keys())

    @staticmethod
    def default_parameters(method: str):
        """Get the default parameters for a given fingerprint method

        Args:
            method: name of the fingerprint method
        """
        return FP_DEF_PARAMS[method].copy()

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        """
        return [f"fp_{i}" for i in range(self._length)]

    def __len__(self):
        """Return the length of the calculator"""
        return self._length

    def _set_length(self, length=None):
        """Get the length of the featurizer"""
        fplen = length
        len_key = None
        if self.method == "maccs":
            fplen = 167
        elif self.method == "estate":
            fplen = 79
        elif self.method == "erg":
            fplen = 315
        elif self.method == "rdkit-count" and not fplen:
            fplen = 2048
        elif "nBits" in self.params.keys():
            len_key = "nBits"
            fplen = self.params[len_key]
        elif "n_permutations" in self.params.keys():
            # special case for mhfp
            len_key = "n_permutations"
            fplen = self.params[len_key]
        elif "fpSize" in self.params.keys():
            len_key = "fpSize"
            fplen = self.params[len_key]
        elif "dimensions" in self.params.keys():
            len_key = "dimensions"
            fplen = self.params[len_key]
        if len_key is not None and length:
            self.params[len_key] = length
            fplen = length
        return fplen

    def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
        r"""
        Compute the Fingerprint of a molecule

        Args:
            mol: the molecule of interest
            raw: whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions

        Returns:
            props (np.ndarray): list of computed rdkit molecular descriptors
        """
        mol = dm.to_mol(mol)
        fp_val = FP_FUNCS[self.method](mol, **self.params)
        if self.counting:
            fp_val = fold_count_fp(fp_val, self._length)
        if not raw:
            fp_val = to_numpy(fp_val)
        if self.counting and raw:
            # converint the counted values to SparseInt again
            fp_val = to_fp(fp_val, bitvect=False)
        return fp_val

    def __getstate__(self):
        # EN: note that the state is standardized with all the parameter
        # because of the possibility of default changing after
        state = {}
        state["length"] = self.input_length
        state["input_length"] = self.input_length
        state["method"] = self.method
        state["counting"] = self.counting
        state["params"] = self.params
        return state

    def __setstate__(self, state: dict):
        """Set the state of the featurizer"""
        self.__dict__.update(state)
        self._length = self._set_length(self.input_length)

    def to_state_dict(self):
        """Get the state dictionary"""
        state_dict = super().to_state_dict()
        cur_params = self.params
        default_params = copy.deepcopy(FP_DEF_PARAMS[state_dict["args"]["method"]])
        state_dict["args"].update(
            {
                k: cur_params[k]
                for k in cur_params
                if (cur_params[k] != default_params[k] and cur_params[k] is not None)
            }
        )
        # we want to keep all the additional parameters in the state dict
        return state_dict

columns property

Get the name of all the descriptors of this calculator

__call__(mol, raw=False)

Compute the Fingerprint of a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
raw bool

whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions

False

Returns:

Name Type Description
props ndarray

list of computed rdkit molecular descriptors

Source code in molfeat/calc/fingerprints.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
    r"""
    Compute the Fingerprint of a molecule

    Args:
        mol: the molecule of interest
        raw: whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions

    Returns:
        props (np.ndarray): list of computed rdkit molecular descriptors
    """
    mol = dm.to_mol(mol)
    fp_val = FP_FUNCS[self.method](mol, **self.params)
    if self.counting:
        fp_val = fold_count_fp(fp_val, self._length)
    if not raw:
        fp_val = to_numpy(fp_val)
    if self.counting and raw:
        # converint the counted values to SparseInt again
        fp_val = to_fp(fp_val, bitvect=False)
    return fp_val

__init__(method, length=None, counting=False, **method_params)

Compute the given fingeprint for a molecule

Note

For efficiency reason, count fingerprints are hashed and potentially re-folded and the count corresponds to the number of bits set to true

Parameters:

Name Type Description Default
method str

Name of the fingerprint method to use. See FPCalculator.available_fingerprints() for a list

required
length int

Length of the fingerprint. Defaults to None. The default corresponds to the fingerpint default.

None
counting bool

Whether to use the count version of the fingerprint

False
method_params dict

any parameters to the fingerprint algorithm. See FPCalculator.default_parameters(method) for all the parameters required by a given method.

{}
Source code in molfeat/calc/fingerprints.py
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def __init__(
    self,
    method: str,
    length: Optional[int] = None,
    counting: bool = False,
    **method_params,
):
    """Compute the given fingeprint for a molecule

    !!! note
        For efficiency reason, count fingerprints are hashed and potentially
        re-folded and the count corresponds to the number of bits set to true

    Args:
        method (str): Name of the fingerprint method to use. See FPCalculator.available_fingerprints() for a list
        length (int, optional): Length of the fingerprint. Defaults to None.
            The default corresponds to the fingerpint default.
        counting (bool, optional): Whether to use the count version of the fingerprint
        method_params (dict): any parameters to the fingerprint algorithm.
            See FPCalculator.default_parameters(method) for all the parameters required by a given method.
    """
    self.method = method.lower()
    self.counting = counting or "-count" in self.method
    if self.counting and "-count" not in self.method:
        self.method = self.method + "-count"
    self.input_length = length
    if self.method not in FP_FUNCS:
        raise ValueError(f"Method {self.method} is not a supported featurizer")
    default_params = copy.deepcopy(FP_DEF_PARAMS[method])
    unknown_params = set(method_params.keys()).difference(set(default_params.keys()))
    if unknown_params:
        logger.error(f"Params: {unknown_params} are not valid for {method}")
    self.params = default_params
    self.params.update(
        {k: method_params[k] for k in method_params if k in default_params.keys()}
    )
    self._length = self._set_length(length)

__len__()

Return the length of the calculator

Source code in molfeat/calc/fingerprints.py
260
261
262
def __len__(self):
    """Return the length of the calculator"""
    return self._length

__setstate__(state)

Set the state of the featurizer

Source code in molfeat/calc/fingerprints.py
327
328
329
330
def __setstate__(self, state: dict):
    """Set the state of the featurizer"""
    self.__dict__.update(state)
    self._length = self._set_length(self.input_length)

available_fingerprints() staticmethod

Get the list of available fingerprints

Source code in molfeat/calc/fingerprints.py
239
240
241
242
@staticmethod
def available_fingerprints():
    """Get the list of available fingerprints"""
    return list(FP_FUNCS.keys())

default_parameters(method) staticmethod

Get the default parameters for a given fingerprint method

Parameters:

Name Type Description Default
method str

name of the fingerprint method

required
Source code in molfeat/calc/fingerprints.py
244
245
246
247
248
249
250
251
@staticmethod
def default_parameters(method: str):
    """Get the default parameters for a given fingerprint method

    Args:
        method: name of the fingerprint method
    """
    return FP_DEF_PARAMS[method].copy()

to_state_dict()

Get the state dictionary

Source code in molfeat/calc/fingerprints.py
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def to_state_dict(self):
    """Get the state dictionary"""
    state_dict = super().to_state_dict()
    cur_params = self.params
    default_params = copy.deepcopy(FP_DEF_PARAMS[state_dict["args"]["method"]])
    state_dict["args"].update(
        {
            k: cur_params[k]
            for k in cur_params
            if (cur_params[k] != default_params[k] and cur_params[k] is not None)
        }
    )
    # we want to keep all the additional parameters in the state dict
    return state_dict

CATS

CATS 2D and 3D implementation based on original work by Rajarshi Guha rguha@indiana.edu 08/26/07 and Chris Arthur 1/11/2015 Rdkit port This version modernizes the code, improve performance, add supports for 3D as well as allowing distance binning. see: https://masterchemoinfo.u-strasbg.fr/Documents/Conferences/Lecture1_Pharmacophores_Schneider.pdf

CATS

Bases: SerializableCalculator

Cats descriptors calculator based on PPPs (potential pharmacophore points). Can be either 2D or 3D.

!!! note: We need to consider all pairwise combination of the 6 PPPs described in CATS2D.SMARTS which would be $P(6,2) + 6$. However, as we only consider lexicographic order, the total size is then $ rac{P(6,2)}{2} + 6 = 21$, explaining the size of CATS2D.DESCRIPTORS

Tip

The CATS descriptor are sensitive to the number of atoms in a molecule, meaning, you would get different results if you add or remove hydrogen atoms

Source code in molfeat/calc/cats.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
class CATS(SerializableCalculator):
    """Cats descriptors calculator based on PPPs (potential pharmacophore points). Can be either 2D or 3D.

    !!! note:
        We need to consider all pairwise combination of the 6 PPPs described in `CATS2D.SMARTS`
        which would be $P(6,2) + 6$. However, as we only consider lexicographic order, the total size
        is then $\frac{P(6,2)}{2} + 6 = 21$, explaining the size of `CATS2D.DESCRIPTORS`

    !!! tip
        The CATS descriptor are sensitive to the number of atoms in a molecule, meaning, you would get different
        results if you add or remove hydrogen atoms

    """

    SMARTS = {
        "D": ["[!$([#6,H0,-,-2,-3])]"],
        "A": ["[!$([#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]"],
        "P": ["[*+]", "[#7H2]"],
        "N": ["[*-]", "[C&$(C(=O)O),P&$(P(=O)),S&$(S(=O)O)]"],
        "L": [
            "[Cl,Br,I]",
            "[S;D2;$(S(C)(C))]",
            "[C;D2;$(C(=C)(=C))]",
            "[C;D3;$(C(=C)(C)(C))]",
            "[C;D4;$(C(C)(C)(C)(C))]",
            "[C;D3;H1;$(C(C)(C)(C))]",
            "[C;D2;H2;$(C(C)(C))]",
        ],
        "R": ["[a]"],
    }

    DESCRIPTORS = [
        "DD",
        "AD",
        "DP",
        "DN",
        "DL",
        "DR",
        "AA",
        "AP",
        "AN",
        "AL",
        "AR",
        "PP",
        "NP",
        "LP",
        "PR",
        "NN",
        "LN",
        "NR",
        "LL",
        "LR",
        "RR",
    ]

    MAX_DIST_DEFAULT_2D = 8
    MAX_DIST_DEFAULT_3D = 5

    def __init__(
        self,
        max_dist: Union[float, int] = None,
        bins: List[int] = None,
        scale: str = "raw",
        use_3d_distances: bool = False,
        **kwargs,
    ):
        """Calculator for the CATS descriptors.

        `max_dist` and `bins` will both determine the length of the fingerprint vector,
        which you can get by calling `len(calc)`

        Args:
            max_dist: Maximum distance between pairs. When set to None, the default for 2D is
                set to `max_dist=8` and for 3D to `max_dist=5`.
            bins: Bins to use. Defaults to equal spacing `[0, max_dist[`.
            scale: How to scale the values. Supported values are:
                 - 'raw' for the raw values.
                 - 'num' for values normalized by the number of atoms.
                 - 'count' for scaling based on occurence of the PPP.
            use_3d_distances: Whether to use the 3D distances instead of the topological distances.
                If set to True, the input molecules must contain a conformer.
            kwargs: silently ignored extra parameters for compatibility with other calculators.
        """

        # Set the max_dist default is set to None
        if max_dist is None:
            if use_3d_distances:
                max_dist = CATS.MAX_DIST_DEFAULT_3D
            else:
                max_dist = CATS.MAX_DIST_DEFAULT_2D

        self.max_dist = max_dist
        self.use_3d_distances = use_3d_distances

        if bins is None:
            bins = list(np.arange(1, np.floor(self.max_dist + 1), 1))

        # we don't allow interaction that exceed our distance threshold.
        bins = [x for x in bins if x <= self.max_dist]

        # we start distance indexing at 0
        if 0 not in bins:
            bins += [0]

        self.bins = list(sorted(bins))

        self.scale = scale

        self._set_columns()

    def _set_columns(self):
        self._columns = []
        for label in self.DESCRIPTORS:
            for i in range(len(self.bins)):
                self._columns.append(f"{label}.bins-{i}")

    @classmethod
    @functools.lru_cache(maxsize=None)
    def _pattern_to_mols(cls, smarts_dict=None):
        """Convert dict of list of smarts to rdkit molecules"""

        if smarts_dict is None:
            smarts_dict = cls.SMARTS

        smarts_mols = ddict(list)
        for label, patterns in smarts_dict.items():
            patterns = [dm.from_smarts(patt) for patt in patterns]
            smarts_mols[label] = patterns

        return smarts_mols

    def _get_pcore_group(self, mol: Union[dm.Mol, str]):
        """
        Assign a PPP (potential pharmacophore points) to individual atoms of a molecule.

        !!! note
            The return value is a list of length `N_atoms` of the
            input molecule. The i'th element of the list contains
            a list of PPP labels that were identified for the i'th atom

        Args:
            mol: the molecule of interest

        Returns:
            ppp_labels (List[list]): list of all PPP labels for each atoms
        """

        smarts_mols = CATS._pattern_to_mols()

        ppp_labels = ["" for x in range(0, mol.GetNumAtoms())]
        for label, patterns in smarts_mols.items():
            for pattern in patterns:
                matched = False
                for matchbase in mol.GetSubstructMatches(pattern, uniquify=True):
                    for idx in matchbase:
                        if ppp_labels[idx] == "":
                            ppp_labels[idx] = [label]
                        else:
                            tmp = ppp_labels[idx]
                            tmp.append(label)
                            ppp_labels[idx] = tmp
                    matched = True
                if matched:
                    break
        return ppp_labels

    def _get_ppp_matrix(self, n_atoms: int, ppp_labels: List):
        """Compute PPP matrix from label list

        Args:
            n_atoms (int): number of atoms
            ppp_labels (list): PPP labels returned by

        Returns:
            pppm (dict): PPP matrix where the keys are the coordinate
        """

        pppm = {}
        for i in range(0, n_atoms):
            ppp_i = ppp_labels[i]
            if ppp_i == "":
                continue
            for j in range(0, n_atoms):
                ppp_j = ppp_labels[j]
                if ppp_j == "":
                    continue
                pairs = []
                for x in ppp_i:
                    for y in ppp_j:
                        if (x, y) not in pairs and (y, x) not in pairs:
                            ## make sure to add the labels in increasing
                            ## lexicographical order
                            if x < y:
                                tmp = (x, y)
                            else:
                                tmp = (y, x)
                            pairs.append(tmp)
                pppm[(i, j)] = pairs
        return pppm

    def _calculate(self, mol, dist_mat):
        """Calculate the CATS2D descriptors for current molecule, given a distance matrix"""

        n_atoms = mol.GetNumAtoms()
        ppp_labels = self._get_pcore_group(mol)
        ppp_mat = self._get_ppp_matrix(n_atoms, ppp_labels)

        # get the counturence of each of the PPP's
        ppp_count = dict(zip(["D", "N", "A", "P", "L", "R"], [0] * 6))
        for label in ppp_labels:
            for ppp in label:
                ppp_count[ppp] = ppp_count[ppp] + 1

        # lets calculate the CATS2D raw descriptor
        # bins: a, b, c ==> [a, b], [b, c], [c, *]
        # a is always 0
        desc = [[0 for x in range(len(self.bins))] for x in range(0, len(self.DESCRIPTORS))]
        for (x, y), labels in ppp_mat.items():
            dist = dist_mat[x, y]
            # ignore all interactions greater than the max distance we set
            # we cannot have negative distance
            if dist > self.max_dist or dist < 0:
                continue

            for pair in labels:
                idx = self.DESCRIPTORS.index(f"{pair[0]}{pair[1]}")
                vals = desc[idx]
                dist_bin = np.digitize(dist, self.bins)
                # indexing at 0
                vals[dist_bin - 1] += 1
                desc[idx] = vals

        if self.scale == "num":
            for row in range(0, len(desc)):
                for col in range(0, len(desc[0])):
                    desc[row][col] = float(desc[row][col]) / n_atoms

        elif self.scale == "count":
            #  get the scaling factors
            facs = [0] * len(self.DESCRIPTORS)
            count = 0
            for ppp in self.DESCRIPTORS:
                facs[count] = ppp_count[ppp[0]] + ppp_count[ppp[1]]
                count += 1

            # each row in desc corresponds to a PPP pair
            # so the scale factor is constant over cols of a row
            count = 0
            for i in range(0, len(desc)):
                if facs[i] == 0:
                    continue
                for j in range(0, len(desc[0])):
                    desc[i][j] = desc[i][j] / float(facs[i])

        res = []
        for row in desc:
            for col in row:
                res.append(col)
        return res

    def __len__(self):
        """Return the length of the calculator"""
        return len(self._columns)

    def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1):
        """Get CATS 2D descriptors for a molecule

        Args:
            mol: the molecule of interest.
            conformer_id: Optional conformer id. Only relevant when `use_3d_distances`
                is set to True.

        Returns:
            props (np.ndarray): list of computed rdkit molecular descriptors
        """

        mol = dm.to_mol(mol)

        if self.use_3d_distances:
            if mol.GetNumConformers() < 1:  # type: ignore
                raise ValueError("Expected a molecule with conformers information.")

            dist_mat = Get3DDistanceMatrix(mol, confId=conformer_id)

        else:
            dist_mat = GetDistanceMatrix(mol).astype(int)

        out = self._calculate(mol, dist_mat)
        return to_numpy(out)

    @property
    def columns(self):
        """Get the descriptors columns"""
        return self._columns

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["max_dist"] = self.max_dist
        state["bins"] = self.bins
        state["scale"] = self.scale
        state["use_3d_distances"] = self.use_3d_distances
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        self._set_columns()

columns property

Get the descriptors columns

__call__(mol, conformer_id=-1)

Get CATS 2D descriptors for a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest.

required
conformer_id int

Optional conformer id. Only relevant when use_3d_distances is set to True.

-1

Returns:

Name Type Description
props ndarray

list of computed rdkit molecular descriptors

Source code in molfeat/calc/cats.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1):
    """Get CATS 2D descriptors for a molecule

    Args:
        mol: the molecule of interest.
        conformer_id: Optional conformer id. Only relevant when `use_3d_distances`
            is set to True.

    Returns:
        props (np.ndarray): list of computed rdkit molecular descriptors
    """

    mol = dm.to_mol(mol)

    if self.use_3d_distances:
        if mol.GetNumConformers() < 1:  # type: ignore
            raise ValueError("Expected a molecule with conformers information.")

        dist_mat = Get3DDistanceMatrix(mol, confId=conformer_id)

    else:
        dist_mat = GetDistanceMatrix(mol).astype(int)

    out = self._calculate(mol, dist_mat)
    return to_numpy(out)

__getstate__()

Serialize the class for pickling.

Source code in molfeat/calc/cats.py
320
321
322
323
324
325
326
327
def __getstate__(self):
    """Serialize the class for pickling."""
    state = {}
    state["max_dist"] = self.max_dist
    state["bins"] = self.bins
    state["scale"] = self.scale
    state["use_3d_distances"] = self.use_3d_distances
    return state

__init__(max_dist=None, bins=None, scale='raw', use_3d_distances=False, **kwargs)

Calculator for the CATS descriptors.

max_dist and bins will both determine the length of the fingerprint vector, which you can get by calling len(calc)

Parameters:

Name Type Description Default
max_dist Union[float, int]

Maximum distance between pairs. When set to None, the default for 2D is set to max_dist=8 and for 3D to max_dist=5.

None
bins List[int]

Bins to use. Defaults to equal spacing [0, max_dist[.

None
scale str

How to scale the values. Supported values are: - 'raw' for the raw values. - 'num' for values normalized by the number of atoms. - 'count' for scaling based on occurence of the PPP.

'raw'
use_3d_distances bool

Whether to use the 3D distances instead of the topological distances. If set to True, the input molecules must contain a conformer.

False
kwargs

silently ignored extra parameters for compatibility with other calculators.

{}
Source code in molfeat/calc/cats.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def __init__(
    self,
    max_dist: Union[float, int] = None,
    bins: List[int] = None,
    scale: str = "raw",
    use_3d_distances: bool = False,
    **kwargs,
):
    """Calculator for the CATS descriptors.

    `max_dist` and `bins` will both determine the length of the fingerprint vector,
    which you can get by calling `len(calc)`

    Args:
        max_dist: Maximum distance between pairs. When set to None, the default for 2D is
            set to `max_dist=8` and for 3D to `max_dist=5`.
        bins: Bins to use. Defaults to equal spacing `[0, max_dist[`.
        scale: How to scale the values. Supported values are:
             - 'raw' for the raw values.
             - 'num' for values normalized by the number of atoms.
             - 'count' for scaling based on occurence of the PPP.
        use_3d_distances: Whether to use the 3D distances instead of the topological distances.
            If set to True, the input molecules must contain a conformer.
        kwargs: silently ignored extra parameters for compatibility with other calculators.
    """

    # Set the max_dist default is set to None
    if max_dist is None:
        if use_3d_distances:
            max_dist = CATS.MAX_DIST_DEFAULT_3D
        else:
            max_dist = CATS.MAX_DIST_DEFAULT_2D

    self.max_dist = max_dist
    self.use_3d_distances = use_3d_distances

    if bins is None:
        bins = list(np.arange(1, np.floor(self.max_dist + 1), 1))

    # we don't allow interaction that exceed our distance threshold.
    bins = [x for x in bins if x <= self.max_dist]

    # we start distance indexing at 0
    if 0 not in bins:
        bins += [0]

    self.bins = list(sorted(bins))

    self.scale = scale

    self._set_columns()

__len__()

Return the length of the calculator

Source code in molfeat/calc/cats.py
285
286
287
def __len__(self):
    """Return the length of the calculator"""
    return len(self._columns)

__setstate__(state)

Reload the class from pickling.

Source code in molfeat/calc/cats.py
329
330
331
332
def __setstate__(self, state: dict):
    """Reload the class from pickling."""
    self.__dict__.update(state)
    self._set_columns()

Pharmacophore

Pharmacophore2D

Bases: SerializableCalculator

2D Pharmacophore.

The fingerprint is computed using Generate.Gen2DFingerprint from RDKit.

An explanation of pharmacophore fingerprints and how the bits are set is available in the RDKit book. In particular the following figure describes the process. Pharmacophore{ align=left }

Source code in molfeat/calc/pharmacophore.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class Pharmacophore2D(SerializableCalculator):
    """2D Pharmacophore.

    The fingerprint is computed using `Generate.Gen2DFingerprint` from RDKit.

    An explanation of pharmacophore fingerprints and how the bits are set
    is available in the RDKit book. In particular the following figure describes the process.
    ![Pharmacophore](https://www.rdkit.org/docs/_images/picture_10.jpg){ align=left }
    """

    def __init__(
        self,
        factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
        length: Optional[int] = 2048,
        useCounts: bool = None,
        minPointCount: int = None,
        maxPointCount: int = None,
        shortestPathsOnly: bool = None,
        includeBondOrder: bool = None,
        skipFeats: List[str] = None,
        trianglePruneBins: bool = None,
        bins: List[Tuple[int, int]] = None,
        **kwargs,
    ):
        """Pharmacophore computation.

        Args:
            factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
                to a feature definition or a feature factory object
            length: Optional desired length. If provided, the fp will be refold or padded to that length.
                If set to None, fallback to the default for the provided sig factory.
            minPointCount: Minimum number of points.
            maxPointCount: Maximum number of points.
            trianglePruneBins: Whether to prune the triangle inequality.
            includeBondOrder: Whether to consider bond order.
            shortestPathsOnly: Whether to only use the shortest path between pharmacophores.
            useCounts: Whether take into account the count information. This will also impact how the folding works.
            bins: Bins to use.
        """

        self.factory = factory
        self.useCounts = useCounts
        self.minPointCount = minPointCount
        self.maxPointCount = maxPointCount
        self.shortestPathsOnly = shortestPathsOnly
        self.includeBondOrder = includeBondOrder
        self.skipFeats = skipFeats
        self.trianglePruneBins = trianglePruneBins
        self.bins = bins

        self.length = length

        self._init_sig_factory()

    def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
        """Compute the Pharmacophore fingeprint for the input molecule.

        Args:
            mol: the molecule of interest
            raw: Whether to return the raw fingerprint or a Numpy array.

        Returns:
            fp: the computed fingerprint as a Numpy array or as a raw object.
        """

        # Get a molecule
        mol = dm.to_mol(mol)

        if mol is None:
            raise ValueError("The input molecule is not valid.")

        # Get distance matrix
        use_bond_order = self.sig_factory.includeBondOrder
        d_mat = rdmolops.GetDistanceMatrix(mol, use_bond_order)

        # Generate the fingerprint
        fp = Generate.Gen2DFingerprint(mol, self.sig_factory, dMat=d_mat)

        # Posprocessing
        if self.length and self._should_fold:
            # refold the fingerprint
            fp = fold_count_fp(fp, dim=self.length, binary=not (self.useCounts or False))
            if raw:
                fp = to_fp(fp, bitvect=True)

        if not raw:
            fp = to_numpy(fp)

        return fp

    def _init_sig_factory(self):
        """Init the feature factory for this pharmacophore."""

        self.sig_factory = get_sig_factory(
            self.factory,
            useCounts=self.useCounts,
            minPointCount=self.minPointCount,
            maxPointCount=self.maxPointCount,
            shortestPathsOnly=self.shortestPathsOnly,
            includeBondOrder=self.includeBondOrder,
            skipFeats=self.skipFeats,
            trianglePruneBins=self.trianglePruneBins,
            bins=self.bins,
        )

        # Reinject used params to the class attributes
        # It might be useful in case the default values are changed
        # and when serializing the object.
        self.useCounts = self.sig_factory.useCounts
        self.minPointCount = self.sig_factory.minPointCount
        self.maxPointCount = self.sig_factory.maxPointCount
        self.shortestPathsOnly = self.sig_factory.shortestPathsOnly
        self.includeBondOrder = self.sig_factory.includeBondOrder
        self.skipFeats = self.sig_factory.skipFeats
        self.trianglePruneBins = self.sig_factory.trianglePruneBins
        self.bins = self.sig_factory.GetBins()

    @property
    @functools.lru_cache(maxsize=None)
    def _should_fold(self):
        return self.sig_factory.GetSigSize() != len(self)

    @property
    def feature_factory(self):
        return self.sig_factory.featFactory

    def __len__(self):
        """Returns the length of the pharmacophore"""
        return self.length or self.sig_factory.GetSigSize()

    @property
    def columns(self):
        """Get the name of all the descriptors of this calculator."""

        if not self.length:
            return [self.sig_factory.GetBitDescription(x) for x in range(len(self))]
        else:
            return [f"Desc:{i}" for i in range(self.length)]

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["factory"] = self.factory
        state["useCounts"] = self.useCounts
        state["minPointCount"] = self.minPointCount
        state["maxPointCount"] = self.maxPointCount
        state["shortestPathsOnly"] = self.shortestPathsOnly
        state["includeBondOrder"] = self.includeBondOrder
        state["skipFeats"] = self.skipFeats
        state["trianglePruneBins"] = self.trianglePruneBins
        state["bins"] = self.bins
        state["length"] = self.length
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        self._init_sig_factory()

columns property

Get the name of all the descriptors of this calculator.

__call__(mol, raw=False)

Compute the Pharmacophore fingeprint for the input molecule.

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
raw bool

Whether to return the raw fingerprint or a Numpy array.

False

Returns:

Name Type Description
fp

the computed fingerprint as a Numpy array or as a raw object.

Source code in molfeat/calc/pharmacophore.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
    """Compute the Pharmacophore fingeprint for the input molecule.

    Args:
        mol: the molecule of interest
        raw: Whether to return the raw fingerprint or a Numpy array.

    Returns:
        fp: the computed fingerprint as a Numpy array or as a raw object.
    """

    # Get a molecule
    mol = dm.to_mol(mol)

    if mol is None:
        raise ValueError("The input molecule is not valid.")

    # Get distance matrix
    use_bond_order = self.sig_factory.includeBondOrder
    d_mat = rdmolops.GetDistanceMatrix(mol, use_bond_order)

    # Generate the fingerprint
    fp = Generate.Gen2DFingerprint(mol, self.sig_factory, dMat=d_mat)

    # Posprocessing
    if self.length and self._should_fold:
        # refold the fingerprint
        fp = fold_count_fp(fp, dim=self.length, binary=not (self.useCounts or False))
        if raw:
            fp = to_fp(fp, bitvect=True)

    if not raw:
        fp = to_numpy(fp)

    return fp

__getstate__()

Serialize the class for pickling.

Source code in molfeat/calc/pharmacophore.py
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def __getstate__(self):
    """Serialize the class for pickling."""
    state = {}
    state["factory"] = self.factory
    state["useCounts"] = self.useCounts
    state["minPointCount"] = self.minPointCount
    state["maxPointCount"] = self.maxPointCount
    state["shortestPathsOnly"] = self.shortestPathsOnly
    state["includeBondOrder"] = self.includeBondOrder
    state["skipFeats"] = self.skipFeats
    state["trianglePruneBins"] = self.trianglePruneBins
    state["bins"] = self.bins
    state["length"] = self.length
    return state

__init__(factory='pmapper', length=2048, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, **kwargs)

Pharmacophore computation.

Parameters:

Name Type Description Default
factory Union[str, MolChemicalFeatureFactory]

Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path to a feature definition or a feature factory object

'pmapper'
length Optional[int]

Optional desired length. If provided, the fp will be refold or padded to that length. If set to None, fallback to the default for the provided sig factory.

2048
minPointCount int

Minimum number of points.

None
maxPointCount int

Maximum number of points.

None
trianglePruneBins bool

Whether to prune the triangle inequality.

None
includeBondOrder bool

Whether to consider bond order.

None
shortestPathsOnly bool

Whether to only use the shortest path between pharmacophores.

None
useCounts bool

Whether take into account the count information. This will also impact how the folding works.

None
bins List[Tuple[int, int]]

Bins to use.

None
Source code in molfeat/calc/pharmacophore.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def __init__(
    self,
    factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
    length: Optional[int] = 2048,
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
    **kwargs,
):
    """Pharmacophore computation.

    Args:
        factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
            to a feature definition or a feature factory object
        length: Optional desired length. If provided, the fp will be refold or padded to that length.
            If set to None, fallback to the default for the provided sig factory.
        minPointCount: Minimum number of points.
        maxPointCount: Maximum number of points.
        trianglePruneBins: Whether to prune the triangle inequality.
        includeBondOrder: Whether to consider bond order.
        shortestPathsOnly: Whether to only use the shortest path between pharmacophores.
        useCounts: Whether take into account the count information. This will also impact how the folding works.
        bins: Bins to use.
    """

    self.factory = factory
    self.useCounts = useCounts
    self.minPointCount = minPointCount
    self.maxPointCount = maxPointCount
    self.shortestPathsOnly = shortestPathsOnly
    self.includeBondOrder = includeBondOrder
    self.skipFeats = skipFeats
    self.trianglePruneBins = trianglePruneBins
    self.bins = bins

    self.length = length

    self._init_sig_factory()

__len__()

Returns the length of the pharmacophore

Source code in molfeat/calc/pharmacophore.py
166
167
168
def __len__(self):
    """Returns the length of the pharmacophore"""
    return self.length or self.sig_factory.GetSigSize()

__setstate__(state)

Reload the class from pickling.

Source code in molfeat/calc/pharmacophore.py
194
195
196
197
def __setstate__(self, state: dict):
    """Reload the class from pickling."""
    self.__dict__.update(state)
    self._init_sig_factory()

Pharmacophore3D

Bases: SerializableCalculator

3D Pharmacophore.

The fingerprint is computed using pmapper.

This featurizer supports building a consensus pharmacophore from a set of molecules.

Source code in molfeat/calc/pharmacophore.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
class Pharmacophore3D(SerializableCalculator):
    """3D Pharmacophore.

    The fingerprint is computed using [`pmapper`](https://github.com/DrrDom/pmapper).

    This featurizer supports building a consensus pharmacophore from a set of molecules.
    """

    def __init__(
        self,
        factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
        length: int = 2048,
        bin_step: float = 1,
        min_features: int = 2,
        max_features: int = 3,
        use_modulo: bool = True,
        tolerance: float = 0,
    ):
        """Pharmacophore computation.

        Args:
            factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
                to a feature definition or a feature factory object
            length: Optional desired length. If provided, the fp will be refold or padded to that length.
                If set to None, fallback to the default for the provided sig factory.
            bin_step: Bin step to use.
            min_features: Minimum number of features to use.
            max_features: Maximum number of features to use.
            use_modulo: whether to use modulo to compute the pharmacophore fingerprint
            tolerance: tolerance value to use when computing the pharmacophore fingerprint
        """

        self.factory = factory
        self.length = length
        self.bin_step = bin_step
        self.min_features = min_features
        self.max_features = max_features
        self.use_modulo = use_modulo
        self.tolerance = tolerance

        self._init_feature_factory()

    def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1, raw: bool = False):
        """Compute the Pharmacophore fingeprint for the input molecule.

        Args:
            mol: the molecule of interest
            conformer_id: the conformer id to use.
            raw: Whether to return the raw fingerprint or a Numpy array.

        Returns:
            fp: the computed fingerprint as a Numpy array.
        """

        # Get a molecule
        mol = dm.to_mol(mol)

        if mol is None:
            raise ValueError("The input molecule is not valid.")

        if mol.GetNumConformers() < 1:  # type: ignore
            raise ValueError("Expected a molecule with conformers information.")

        # Get the features for the mol
        features = self.get_features(mol, conformer_id=conformer_id)

        # Convert features dataframe to coordinates
        if features.empty:
            features_coords = []
        else:
            features_coords = features[["feature_name", "coords"]].values.tolist()

        # Compute the fingerprint
        fp = self.compute_fp_from_coords(features_coords, raw=raw)

        return fp

    def consensus_fp(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        min_samples_ratio: float = 0.5,
        eps: float = 2,
        raw: bool = False,
        **cluster_kwargs,
    ):
        """Compute a consensus fingerprint from a list of molecules.

        Args:
            mols: a list of molecules.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other.
            raw: Whether to return the raw fingerprint or a Numpy array.
            cluster_kwargs: additional keyword arguments for the clustering algorithm.
        """

        # Get all the features
        features = self.get_features_from_many(
            mols,
            keep_mols=True,
            align=align,
            conformer_id=conformer_id,
            copy=copy,
        )

        # Retrieve the aligned molecules
        mols = features.groupby("mol_index").first()["mol"].tolist()
        # Cluster the features
        clustered_features = self.cluster_features(
            features, min_samples_ratio=min_samples_ratio, eps=eps, **cluster_kwargs
        )
        # Convert features dataframe to coordinates
        if clustered_features.empty:
            features_coords = []
        else:
            features_coords = clustered_features[["feature_name", "coords"]].values.tolist()
        # Compute the fingerprint
        fp = self.compute_fp_from_coords(features_coords, raw=raw)

        return fp

    def _init_feature_factory(self):
        """Init the feature factory."""
        self.feature_factory = get_feature_factory(self.factory)

    def get_features(self, mol: dm.Mol, conformer_id: int = -1) -> pd.DataFrame:
        """Retrieve the features for a given molecule.

        Args:
            mol: the molecule of interest

        Returns:
            features: the features as a Numpy array
        """
        features_data = []

        # Extract the features for this molecule
        features = self.feature_factory.GetFeaturesForMol(mol, confId=conformer_id)

        # Extract all the feature atom indices for this molecule
        for feature in features:
            datum = {}
            datum["feature_id"] = feature.GetId()
            datum["feature_name"] = feature.GetFamily()
            datum["feature_type"] = feature.GetType()
            datum["atom_indices"] = feature.GetAtomIds()
            datum["coords"] = np.array(feature.GetPos())

            features_data.append(datum)

        features_data = pd.DataFrame(features_data)

        return features_data

    def get_features_from_many(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        keep_mols: bool = False,
    ):
        """Extract all the features from a list of molecules after an optional
        alignement step.

        Args:
            mols: List of molecules with conformers.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            keep_mols: Whether to keep the molecules in the returned dataframe.
        """

        if not all([mol.GetNumConformers() >= 1 for mol in mols]):
            raise ValueError("One or more input molecules is missing a conformer.")

        # Make a copy of the molecules since they are going to be modified
        if copy:
            mols = [dm.copy_mol(mol) for mol in mols]

        # Align the conformers
        if align:
            mols, _ = commons.align_conformers(mols, copy=False, conformer_id=conformer_id)

        all_features = pd.DataFrame()

        for i, mol in enumerate(mols):
            features = self.get_features(mol)
            features["mol_index"] = i

            if keep_mols:
                features["mol"] = mol

            all_features = pd.concat([all_features, features], ignore_index=True)

        return all_features

    def compute_fp_from_coords(
        self,
        features_coords: List[Tuple[str, Tuple[float]]],
        raw: bool = False,
    ):
        """Compute a fingerprint from a list of features.

        Args:
            features_coords: Features coords: `[('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...]`.
            raw: Whether to return the raw fingerprint or a Numpy array.
        """

        # Init the pmapper engine
        ph_engine = Pharm(bin_step=self.bin_step)
        # Convert coords to list in case those are arrays
        features_coords = [(name, tuple(coords)) for name, coords in features_coords]
        # Load pharmacophore points
        ph_engine.load_from_feature_coords(features_coords)
        # Init the iterator over the pharmacophore points
        points_iterator = ph_engine.iterate_pharm(
            min_features=self.min_features,
            max_features=self.max_features,
            tol=self.tolerance,
            return_feature_ids=False,
        )

        # Compute the fingerprint
        on_bits = set()
        for h in points_iterator:
            if self.use_modulo:
                on_bits.add(int(h, 16) % self.length)  # type: ignore
            else:
                random.seed(int(h, 16))  # type: ignore
                on_bits.add(random.randrange(self.length))

        if raw:
            return np.array(on_bits)

        fp = np.zeros(self.length, dtype=int)
        fp[list(on_bits)] = 1

        return fp

    def cluster_features(
        self,
        features: pd.DataFrame,
        min_samples_ratio: float = 0.5,
        n_mols: int = None,
        eps: float = np.inf,
        **kwargs,
    ):
        """Cluster a set of pharmacophoric features using OPTICS.
        The only reason why we are not using SpectralClustering is because of the need to provide
        the number of clusters.

        Args:
            features: A dataframe of features.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            n_mols: Optional number of compounds to compute `min_samples` from the
                `min_samples_ratio` value. If not set it will use `mol_index` from
                the `features` dataframe.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other. This is max_eps in OPTICS
            kwargs: Any additional parameters to pass to `sklearn.cluster.OPTICS`.
        """

        if n_mols is None:
            n_mols = len(features["mol_index"].unique())

        # Compute min_samples
        min_samples = max(int(round(min_samples_ratio * n_mols, 0)), 1)
        clusters = []
        feature_id = 0
        for _, rows in features.groupby("feature_name"):
            feature_name = rows.iloc[0]["feature_name"]
            if min_samples > rows.shape[0]:
                logger.info(
                    f"Feature {feature_name} does not have enough molecule ({len(rows)}), skipping"
                )
                continue
            coords = np.vstack(rows["coords"].values)

            # Init clustering
            optics = OPTICS(min_samples=min_samples, max_eps=eps, **kwargs)
            optics = optics.fit(coords)
            labels = optics.labels_
            # a node that is not a core would basically be a node that cannot be labeled
            # thus border nodes are considered core
            core_samples_mask = np.zeros_like(labels, dtype=bool)
            core_samples_mask[labels == 1] = True

            # Find the centroids (consensus points)
            unique_labels = set(labels)
            for k in unique_labels:
                if k == -1:
                    continue
                class_member_mask = labels == k
                cluster_coords = coords[class_member_mask & core_samples_mask]
                if len(cluster_coords) == 0:
                    continue
                cluster_centroid = cluster_coords.mean(axis=0)

                cluster = {}
                cluster["feature_id"] = feature_id
                cluster["feature_name"] = feature_name
                cluster["coords"] = cluster_centroid
                cluster["cluster_size"] = len(cluster_coords)

                clusters.append(cluster)
                feature_id += 1

        clusters = pd.DataFrame(clusters)

        return clusters

    ## Viz methods

    def show(
        self,
        mol: dm.Mol,
        features: pd.DataFrame = None,
        alpha: float = 1.0,
        sphere_radius: float = 0.4,
        show_legend: bool = True,
    ):
        """Show a 3D view of a given molecule with the pharmacophoric features.

        Args:
            mol: the molecule of interest
            alpha: Alpha value for the colors (currently not working).
            sphere_radius: Radius of the spheres for the features.
            show_legend: Display the legend (the layout is bad but at least it
                shows the legend).
        """

        if features is None:
            features = self.get_features(mol)

        return viz.show_pharm_features(
            mol,
            features=features,
            feature_factory=self.feature_factory,
            alpha=alpha,
            sphere_radius=sphere_radius,
            show_legend=show_legend,
        )

    def show_many(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        min_samples_ratio: float = 0.5,
        eps: float = 2,
        alpha: float = 1.0,
        sphere_radius: float = 0.4,
        show_legend: bool = True,
    ):
        """Show a 3D view of a given molecule with the pharmacophoric features.

        Args:
            mols: a list of molecules.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other.
            alpha: Alpha value for the colors (currently not working).
            sphere_radius: Radius of the spheres for the features.
            show_legend: Display the legend (the layout is bad but at least it
                shows the legend).
        """

        # Get all the features
        features = self.get_features_from_many(
            mols,
            keep_mols=True,
            align=align,
            conformer_id=conformer_id,
            copy=copy,
        )

        # Retrieve the aligned molecules
        mols = features.groupby("mol_index").first()["mol"].tolist()

        # Cluster the features
        clustered_features = self.cluster_features(
            features,
            min_samples_ratio=min_samples_ratio,
            eps=eps,
        )

        return viz.show_pharm_features(
            mols,
            features=clustered_features,
            feature_factory=self.feature_factory,
            alpha=alpha,
            sphere_radius=sphere_radius,
            show_legend=show_legend,
        )

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["factory"] = self.factory
        state["length"] = self.length
        state["bin_step"] = self.bin_step
        state["min_features"] = self.min_features
        state["max_features"] = self.max_features
        state["use_modulo"] = self.use_modulo
        state["tolerance"] = self.tolerance
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        self._init_feature_factory()

__call__(mol, conformer_id=-1, raw=False)

Compute the Pharmacophore fingeprint for the input molecule.

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
conformer_id int

the conformer id to use.

-1
raw bool

Whether to return the raw fingerprint or a Numpy array.

False

Returns:

Name Type Description
fp

the computed fingerprint as a Numpy array.

Source code in molfeat/calc/pharmacophore.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1, raw: bool = False):
    """Compute the Pharmacophore fingeprint for the input molecule.

    Args:
        mol: the molecule of interest
        conformer_id: the conformer id to use.
        raw: Whether to return the raw fingerprint or a Numpy array.

    Returns:
        fp: the computed fingerprint as a Numpy array.
    """

    # Get a molecule
    mol = dm.to_mol(mol)

    if mol is None:
        raise ValueError("The input molecule is not valid.")

    if mol.GetNumConformers() < 1:  # type: ignore
        raise ValueError("Expected a molecule with conformers information.")

    # Get the features for the mol
    features = self.get_features(mol, conformer_id=conformer_id)

    # Convert features dataframe to coordinates
    if features.empty:
        features_coords = []
    else:
        features_coords = features[["feature_name", "coords"]].values.tolist()

    # Compute the fingerprint
    fp = self.compute_fp_from_coords(features_coords, raw=raw)

    return fp

__getstate__()

Serialize the class for pickling.

Source code in molfeat/calc/pharmacophore.py
609
610
611
612
613
614
615
616
617
618
619
def __getstate__(self):
    """Serialize the class for pickling."""
    state = {}
    state["factory"] = self.factory
    state["length"] = self.length
    state["bin_step"] = self.bin_step
    state["min_features"] = self.min_features
    state["max_features"] = self.max_features
    state["use_modulo"] = self.use_modulo
    state["tolerance"] = self.tolerance
    return state

__init__(factory='pmapper', length=2048, bin_step=1, min_features=2, max_features=3, use_modulo=True, tolerance=0)

Pharmacophore computation.

Parameters:

Name Type Description Default
factory Union[str, MolChemicalFeatureFactory]

Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path to a feature definition or a feature factory object

'pmapper'
length int

Optional desired length. If provided, the fp will be refold or padded to that length. If set to None, fallback to the default for the provided sig factory.

2048
bin_step float

Bin step to use.

1
min_features int

Minimum number of features to use.

2
max_features int

Maximum number of features to use.

3
use_modulo bool

whether to use modulo to compute the pharmacophore fingerprint

True
tolerance float

tolerance value to use when computing the pharmacophore fingerprint

0
Source code in molfeat/calc/pharmacophore.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def __init__(
    self,
    factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
    length: int = 2048,
    bin_step: float = 1,
    min_features: int = 2,
    max_features: int = 3,
    use_modulo: bool = True,
    tolerance: float = 0,
):
    """Pharmacophore computation.

    Args:
        factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
            to a feature definition or a feature factory object
        length: Optional desired length. If provided, the fp will be refold or padded to that length.
            If set to None, fallback to the default for the provided sig factory.
        bin_step: Bin step to use.
        min_features: Minimum number of features to use.
        max_features: Maximum number of features to use.
        use_modulo: whether to use modulo to compute the pharmacophore fingerprint
        tolerance: tolerance value to use when computing the pharmacophore fingerprint
    """

    self.factory = factory
    self.length = length
    self.bin_step = bin_step
    self.min_features = min_features
    self.max_features = max_features
    self.use_modulo = use_modulo
    self.tolerance = tolerance

    self._init_feature_factory()

__setstate__(state)

Reload the class from pickling.

Source code in molfeat/calc/pharmacophore.py
621
622
623
624
def __setstate__(self, state: dict):
    """Reload the class from pickling."""
    self.__dict__.update(state)
    self._init_feature_factory()

cluster_features(features, min_samples_ratio=0.5, n_mols=None, eps=np.inf, **kwargs)

Cluster a set of pharmacophoric features using OPTICS. The only reason why we are not using SpectralClustering is because of the need to provide the number of clusters.

Parameters:

Name Type Description Default
features DataFrame

A dataframe of features.

required
min_samples_ratio float

Percentages of mols that must contain a pharmacophoric point to be considered as a core point.

0.5
n_mols int

Optional number of compounds to compute min_samples from the min_samples_ratio value. If not set it will use mol_index from the features dataframe.

None
eps float

The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is max_eps in OPTICS

inf
kwargs

Any additional parameters to pass to sklearn.cluster.OPTICS.

{}
Source code in molfeat/calc/pharmacophore.py
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
def cluster_features(
    self,
    features: pd.DataFrame,
    min_samples_ratio: float = 0.5,
    n_mols: int = None,
    eps: float = np.inf,
    **kwargs,
):
    """Cluster a set of pharmacophoric features using OPTICS.
    The only reason why we are not using SpectralClustering is because of the need to provide
    the number of clusters.

    Args:
        features: A dataframe of features.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        n_mols: Optional number of compounds to compute `min_samples` from the
            `min_samples_ratio` value. If not set it will use `mol_index` from
            the `features` dataframe.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. This is max_eps in OPTICS
        kwargs: Any additional parameters to pass to `sklearn.cluster.OPTICS`.
    """

    if n_mols is None:
        n_mols = len(features["mol_index"].unique())

    # Compute min_samples
    min_samples = max(int(round(min_samples_ratio * n_mols, 0)), 1)
    clusters = []
    feature_id = 0
    for _, rows in features.groupby("feature_name"):
        feature_name = rows.iloc[0]["feature_name"]
        if min_samples > rows.shape[0]:
            logger.info(
                f"Feature {feature_name} does not have enough molecule ({len(rows)}), skipping"
            )
            continue
        coords = np.vstack(rows["coords"].values)

        # Init clustering
        optics = OPTICS(min_samples=min_samples, max_eps=eps, **kwargs)
        optics = optics.fit(coords)
        labels = optics.labels_
        # a node that is not a core would basically be a node that cannot be labeled
        # thus border nodes are considered core
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_samples_mask[labels == 1] = True

        # Find the centroids (consensus points)
        unique_labels = set(labels)
        for k in unique_labels:
            if k == -1:
                continue
            class_member_mask = labels == k
            cluster_coords = coords[class_member_mask & core_samples_mask]
            if len(cluster_coords) == 0:
                continue
            cluster_centroid = cluster_coords.mean(axis=0)

            cluster = {}
            cluster["feature_id"] = feature_id
            cluster["feature_name"] = feature_name
            cluster["coords"] = cluster_centroid
            cluster["cluster_size"] = len(cluster_coords)

            clusters.append(cluster)
            feature_id += 1

    clusters = pd.DataFrame(clusters)

    return clusters

compute_fp_from_coords(features_coords, raw=False)

Compute a fingerprint from a list of features.

Parameters:

Name Type Description Default
features_coords List[Tuple[str, Tuple[float]]]

Features coords: [('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...].

required
raw bool

Whether to return the raw fingerprint or a Numpy array.

False
Source code in molfeat/calc/pharmacophore.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def compute_fp_from_coords(
    self,
    features_coords: List[Tuple[str, Tuple[float]]],
    raw: bool = False,
):
    """Compute a fingerprint from a list of features.

    Args:
        features_coords: Features coords: `[('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...]`.
        raw: Whether to return the raw fingerprint or a Numpy array.
    """

    # Init the pmapper engine
    ph_engine = Pharm(bin_step=self.bin_step)
    # Convert coords to list in case those are arrays
    features_coords = [(name, tuple(coords)) for name, coords in features_coords]
    # Load pharmacophore points
    ph_engine.load_from_feature_coords(features_coords)
    # Init the iterator over the pharmacophore points
    points_iterator = ph_engine.iterate_pharm(
        min_features=self.min_features,
        max_features=self.max_features,
        tol=self.tolerance,
        return_feature_ids=False,
    )

    # Compute the fingerprint
    on_bits = set()
    for h in points_iterator:
        if self.use_modulo:
            on_bits.add(int(h, 16) % self.length)  # type: ignore
        else:
            random.seed(int(h, 16))  # type: ignore
            on_bits.add(random.randrange(self.length))

    if raw:
        return np.array(on_bits)

    fp = np.zeros(self.length, dtype=int)
    fp[list(on_bits)] = 1

    return fp

consensus_fp(mols, align=True, conformer_id=-1, copy=True, min_samples_ratio=0.5, eps=2, raw=False, **cluster_kwargs)

Compute a consensus fingerprint from a list of molecules.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
align bool

Whether to align the conformers of the molecules.

True
conformer_id int

Optional conformer id.

-1
copy bool

Whether to copy the molecules before clustering.

True
min_samples_ratio float

Percentages of mols that must contain a pharmacophoric point to be considered as a core point.

0.5
eps float

The maximum distance between two samples for one to be considered as in the neighborhood of the other.

2
raw bool

Whether to return the raw fingerprint or a Numpy array.

False
cluster_kwargs

additional keyword arguments for the clustering algorithm.

{}
Source code in molfeat/calc/pharmacophore.py
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def consensus_fp(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    min_samples_ratio: float = 0.5,
    eps: float = 2,
    raw: bool = False,
    **cluster_kwargs,
):
    """Compute a consensus fingerprint from a list of molecules.

    Args:
        mols: a list of molecules.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id.
        copy: Whether to copy the molecules before clustering.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other.
        raw: Whether to return the raw fingerprint or a Numpy array.
        cluster_kwargs: additional keyword arguments for the clustering algorithm.
    """

    # Get all the features
    features = self.get_features_from_many(
        mols,
        keep_mols=True,
        align=align,
        conformer_id=conformer_id,
        copy=copy,
    )

    # Retrieve the aligned molecules
    mols = features.groupby("mol_index").first()["mol"].tolist()
    # Cluster the features
    clustered_features = self.cluster_features(
        features, min_samples_ratio=min_samples_ratio, eps=eps, **cluster_kwargs
    )
    # Convert features dataframe to coordinates
    if clustered_features.empty:
        features_coords = []
    else:
        features_coords = clustered_features[["feature_name", "coords"]].values.tolist()
    # Compute the fingerprint
    fp = self.compute_fp_from_coords(features_coords, raw=raw)

    return fp

get_features(mol, conformer_id=-1)

Retrieve the features for a given molecule.

Parameters:

Name Type Description Default
mol Mol

the molecule of interest

required

Returns:

Name Type Description
features DataFrame

the features as a Numpy array

Source code in molfeat/calc/pharmacophore.py
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
def get_features(self, mol: dm.Mol, conformer_id: int = -1) -> pd.DataFrame:
    """Retrieve the features for a given molecule.

    Args:
        mol: the molecule of interest

    Returns:
        features: the features as a Numpy array
    """
    features_data = []

    # Extract the features for this molecule
    features = self.feature_factory.GetFeaturesForMol(mol, confId=conformer_id)

    # Extract all the feature atom indices for this molecule
    for feature in features:
        datum = {}
        datum["feature_id"] = feature.GetId()
        datum["feature_name"] = feature.GetFamily()
        datum["feature_type"] = feature.GetType()
        datum["atom_indices"] = feature.GetAtomIds()
        datum["coords"] = np.array(feature.GetPos())

        features_data.append(datum)

    features_data = pd.DataFrame(features_data)

    return features_data

get_features_from_many(mols, align=True, conformer_id=-1, copy=True, keep_mols=False)

Extract all the features from a list of molecules after an optional alignement step.

Parameters:

Name Type Description Default
mols List[Mol]

List of molecules with conformers.

required
align bool

Whether to align the conformers of the molecules.

True
conformer_id int

Optional conformer id.

-1
copy bool

Whether to copy the molecules before clustering.

True
keep_mols bool

Whether to keep the molecules in the returned dataframe.

False
Source code in molfeat/calc/pharmacophore.py
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
def get_features_from_many(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    keep_mols: bool = False,
):
    """Extract all the features from a list of molecules after an optional
    alignement step.

    Args:
        mols: List of molecules with conformers.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id.
        copy: Whether to copy the molecules before clustering.
        keep_mols: Whether to keep the molecules in the returned dataframe.
    """

    if not all([mol.GetNumConformers() >= 1 for mol in mols]):
        raise ValueError("One or more input molecules is missing a conformer.")

    # Make a copy of the molecules since they are going to be modified
    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    # Align the conformers
    if align:
        mols, _ = commons.align_conformers(mols, copy=False, conformer_id=conformer_id)

    all_features = pd.DataFrame()

    for i, mol in enumerate(mols):
        features = self.get_features(mol)
        features["mol_index"] = i

        if keep_mols:
            features["mol"] = mol

        all_features = pd.concat([all_features, features], ignore_index=True)

    return all_features

show(mol, features=None, alpha=1.0, sphere_radius=0.4, show_legend=True)

Show a 3D view of a given molecule with the pharmacophoric features.

Parameters:

Name Type Description Default
mol Mol

the molecule of interest

required
alpha float

Alpha value for the colors (currently not working).

1.0
sphere_radius float

Radius of the spheres for the features.

0.4
show_legend bool

Display the legend (the layout is bad but at least it shows the legend).

True
Source code in molfeat/calc/pharmacophore.py
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
def show(
    self,
    mol: dm.Mol,
    features: pd.DataFrame = None,
    alpha: float = 1.0,
    sphere_radius: float = 0.4,
    show_legend: bool = True,
):
    """Show a 3D view of a given molecule with the pharmacophoric features.

    Args:
        mol: the molecule of interest
        alpha: Alpha value for the colors (currently not working).
        sphere_radius: Radius of the spheres for the features.
        show_legend: Display the legend (the layout is bad but at least it
            shows the legend).
    """

    if features is None:
        features = self.get_features(mol)

    return viz.show_pharm_features(
        mol,
        features=features,
        feature_factory=self.feature_factory,
        alpha=alpha,
        sphere_radius=sphere_radius,
        show_legend=show_legend,
    )

show_many(mols, align=True, conformer_id=-1, copy=True, min_samples_ratio=0.5, eps=2, alpha=1.0, sphere_radius=0.4, show_legend=True)

Show a 3D view of a given molecule with the pharmacophoric features.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
align bool

Whether to align the conformers of the molecules.

True
conformer_id int

Optional conformer id.

-1
copy bool

Whether to copy the molecules before clustering.

True
min_samples_ratio float

Percentages of mols that must contain a pharmacophoric point to be considered as a core point.

0.5
eps float

The maximum distance between two samples for one to be considered as in the neighborhood of the other.

2
alpha float

Alpha value for the colors (currently not working).

1.0
sphere_radius float

Radius of the spheres for the features.

0.4
show_legend bool

Display the legend (the layout is bad but at least it shows the legend).

True
Source code in molfeat/calc/pharmacophore.py
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
def show_many(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    min_samples_ratio: float = 0.5,
    eps: float = 2,
    alpha: float = 1.0,
    sphere_radius: float = 0.4,
    show_legend: bool = True,
):
    """Show a 3D view of a given molecule with the pharmacophoric features.

    Args:
        mols: a list of molecules.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id.
        copy: Whether to copy the molecules before clustering.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other.
        alpha: Alpha value for the colors (currently not working).
        sphere_radius: Radius of the spheres for the features.
        show_legend: Display the legend (the layout is bad but at least it
            shows the legend).
    """

    # Get all the features
    features = self.get_features_from_many(
        mols,
        keep_mols=True,
        align=align,
        conformer_id=conformer_id,
        copy=copy,
    )

    # Retrieve the aligned molecules
    mols = features.groupby("mol_index").first()["mol"].tolist()

    # Cluster the features
    clustered_features = self.cluster_features(
        features,
        min_samples_ratio=min_samples_ratio,
        eps=eps,
    )

    return viz.show_pharm_features(
        mols,
        features=clustered_features,
        feature_factory=self.feature_factory,
        alpha=alpha,
        sphere_radius=sphere_radius,
        show_legend=show_legend,
    )

get_feature_factory(factory)

Build a feature factory.

Source code in molfeat/calc/pharmacophore.py
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
def get_feature_factory(
    factory: Union[str, MolChemicalFeatureFactory]
) -> MolChemicalFeatureFactory:
    """Build a feature factory."""

    if isinstance(factory, MolChemicalFeatureFactory):
        feature_factory = factory

    elif factory == "pmapper":
        with pkg_resources.path("pmapper", "smarts_features.fdef") as fdef_name:
            feature_factory = ChemicalFeatures.BuildFeatureFactory(str(fdef_name))  # type: ignore

    elif factory == "gobbi":
        feature_factory = Gobbi_Pharm2D.factory.featFactory

    elif factory == "cats":
        with pkg_resources.open_text("molfeat.data", "cats_features.fdef") as instream:
            feature_factory = ChemicalFeatures.BuildFeatureFactoryFromString(instream.read())  # type: ignore

    elif factory == "default":
        # Load default feature definition file
        fdefFile = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
        feature_factory = ChemicalFeatures.BuildFeatureFactory(fdefFile)  # type: ignore

    elif dm.fs.exists(factory):
        with fsspec.open(factory, "r") as instream:
            fdef = instream.read()
            feature_factory = ChemicalFeatures.BuildFeatureFactoryFromString(fdef)  # type: ignore

    else:
        raise ValueError(f"The factory '{factory}' is not supported.")

    return feature_factory

get_sig_factory(factory, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, init_factory=True)

Build a signature factory.

Source code in molfeat/calc/pharmacophore.py
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
def get_sig_factory(
    factory: Union[str, MolChemicalFeatureFactory],
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
    init_factory: bool = True,
):
    """Build a signature factory."""

    # Get feature factory
    feature_factory = get_feature_factory(factory)

    # Get default params and override them as needed
    params, bins = get_sig_factory_params(
        factory,
        useCounts=useCounts,
        minPointCount=minPointCount,
        maxPointCount=maxPointCount,
        shortestPathsOnly=shortestPathsOnly,
        includeBondOrder=includeBondOrder,
        skipFeats=skipFeats,
        trianglePruneBins=trianglePruneBins,
        bins=bins,
    )

    # Build signature factory
    sig_factory = SigFactory(feature_factory, **params)

    # Set bins
    sig_factory.SetBins(bins)

    # Init the factory
    if init_factory:
        sig_factory.Init()

    return sig_factory

get_sig_factory_params(factory_name, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None)

Get the default parameter for a given sig factory allowing some of them to be overriden.

Parameters:

Name Type Description Default
factory_name str

The name of the factory.

required
Source code in molfeat/calc/pharmacophore.py
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
def get_sig_factory_params(
    factory_name: str,
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
) -> Tuple[Dict[str, Any], list]:
    """Get the default parameter for a given sig factory allowing some of them to be overriden.

    Args:
        factory_name: The name of the factory.
    """

    # Get default params.

    if factory_name == "cats":
        default_bins = [
            (0, 1),
            (1, 2),
            (2, 3),
            (3, 4),
            (4, 5),
            (5, 6),
            (6, 7),
            (7, 8),
            (8, 9),
        ]
        params = dict(
            useCounts=True,
            minPointCount=2,
            maxPointCount=2,
            trianglePruneBins=True,
            shortestPathsOnly=True,
            includeBondOrder=False,
        )

    elif factory_name == "gobbi":
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]
        params = dict(
            useCounts=False,
            minPointCount=2,
            maxPointCount=3,
            trianglePruneBins=True,
            shortestPathsOnly=True,
            includeBondOrder=False,
        )

    elif factory_name == "pmapper":
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]
        params = dict(
            useCounts=False,
            minPointCount=2,
            maxPointCount=3,
            trianglePruneBins=False,
            shortestPathsOnly=True,
            includeBondOrder=False,
        )

    elif factory_name == "default":
        params = dict(
            useCounts=False,
            minPointCount=2,
            maxPointCount=3,
            trianglePruneBins=False,
            shortestPathsOnly=True,
            skipFeats=["ZnBinder", "LumpedHydrophobe"],
            includeBondOrder=False,
        )
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]

    else:
        raise ValueError(f"Default values for {factory_name} are not known.")

    # Override default params when set.

    if minPointCount is not None:
        params["minPointCount"] = minPointCount

    if maxPointCount is not None:
        params["maxPointCount"] = maxPointCount

    if trianglePruneBins is not None:
        params["trianglePruneBins"] = trianglePruneBins

    if includeBondOrder is not None:
        params["includeBondOrder"] = includeBondOrder

    if useCounts is not None:
        params["useCounts"] = useCounts

    if skipFeats is not None:
        params["skipFeats"] = skipFeats  # type: ignore

    if shortestPathsOnly is not None:
        params["shortestPathsOnly"] = shortestPathsOnly

    bins = bins or default_bins

    return params, bins

Scaffold Keys

ScaffoldKeyCalculator

Bases: SerializableCalculator

Implementation of the Scaffold Keys described in Identification of Bioisosteric Scaffolds using Scaffold Keys by Peter Ertl

Source code in molfeat/calc/skeys.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
class ScaffoldKeyCalculator(SerializableCalculator):
    """
    Implementation of the Scaffold Keys described in
    `Identification of Bioisosteric Scaffolds using Scaffold Keys` by Peter Ertl
    """

    DESCRIPTORS = [
        "n_atom_in_rings",
        "n_atom_in_conjugated_ring",
        "n_atoms_not_in_conjugated_ring",
        "n_atom_in_chain",
        "n_atom_exocyclic",
        "n_nitrogen",
        "n_nitrogen_in_ring",
        "n_oxygen",
        "n_oxygen_in_ring",
        "n_sulfur",
        "n_heteroatoms",
        "n_heteroatoms_in_ring",
        "n_atom_spiro_atoms",
        "n_heteroatom_more_than_2_conn",
        "n_carbon_atleast_2_heteroatoms",
        "n_atom_at_least_2_nei_more_than_2_conn",
        "abs_scaffold_format_charge",
        "n_bonds",
        "n_multiple_non_conj_ring_bonds",
        "n_bonds_2_heteroatoms",
        "n_carbon_het_carbon_het_bonds",
        "n_bonds_at_least_3_conn",
        "n_exocyclic_single_bonds_carbon",
        "n_exocyclic_single_bonds_nitrogen",
        "n_non_ring_bonds_2_conj_rings",
        "n_non_ring_bonds_conj_nonconj_rings",
        "n_bonds_atoms_with_at_least_one_nei_with_2_conn",
        "n_simple_rings",
        "size_largest_ring",
        "n_simple_rings_no_heteroatoms",
        "n_simple_rings_1_heteroatoms",
        "n_simple_rings_2_heteroatoms",
        "n_simple_rings_at_least_3_heteroatoms",
        "n_simple_non_conj_5_atoms_rings",
        "n_simple_non_conj_6_atoms_rings",
        "n_ring_system",
        "n_ring_system_with_2_non_conj_simple_ring",
        "n_ring_system_with_2_conj_simple_ring",
        "n_ring_system_with_conj_non_conj_simple_ring",
        "n_ring_system_with_3_conj_simple_ring",
        "n_ring_system_with_3_non_conj_simple_ring",
        "n_ring_system_with_greater_one_conj_nonconj_simple_ring",
    ]

    NORM_PARAMS = pd.read_csv(
        Path(molfeat.__file__).parents[0].joinpath("data/skey_parameters.csv"),
        index_col=0,
    ).loc[DESCRIPTORS]

    def __init__(
        self, normalize: bool = False, verbose: bool = False, use_scaffold: bool = False, **kwargs
    ):
        """
        Init of the scaffold key function

        Args:
            normalize: whether to normalize the value of the feature
            verbose: whether to log errors
            use_scaffold: whether to convert the molecule into scaffold first
        """
        self.normalize = normalize
        self.verbose = verbose
        self.use_scaffold = use_scaffold

    def __getstate__(self):
        """Get state of the scaffold key function"""
        state = {}
        state["normalize"] = self.normalize
        state["verbose"] = self.verbose
        state["use_scaffold"] = self.use_scaffold
        return state

    def __len__(self):
        return len(self.DESCRIPTORS)

    @classmethod
    def compute_normalization(cls, features: np.ndarray):
        """Normalize input features. The normalization parameters are
        computed by the scaffolds of 2.1M molecules from CHEMBL 29.
        """
        return (features - cls.NORM_PARAMS["mean"]) / cls.NORM_PARAMS["std"]

    def n_atom_in_rings(self, mol: dm.Mol):
        """1. number of ring atoms"""
        sm = dm.from_smarts("[r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_in_conjugated_ring(self, mol: dm.Mol):
        """2. number of atoms in conjugated rings"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if _is_ring_fully_conjugated(mol, ring):
                n += len(ring)
        return n

    def n_atoms_not_in_conjugated_ring(self, mol: dm.Mol):
        """
        3. number of atoms not in conjugated rings
        (i.e. atoms in aliphatic rings and non-ring atoms)
        """
        # EN: replace conjugation by aromatic
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring):
                n += len(ring)
        return n

    def n_atom_in_chain(self, mol: dm.Mol):
        """4. number atoms in chains (not counting double-connected exo-chain atoms)"""
        sm = dm.from_smarts("[!r;!$(*=[r])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_exocyclic(self, mol: dm.Mol):
        """5. number of exocyclic atoms (connected by multiple bonds to a ring)"""
        sm = dm.from_smarts("[!r;!$(*-[r])&$(*~[r])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_nitrogen(self, mol: dm.Mol):
        """6. number of nitrogen"""
        sm = dm.from_smarts("[#7]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_nitrogen_in_ring(self, mol: dm.Mol):
        """7. number of nitrogen in rings"""
        sm = dm.from_smarts("[#7;r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_oxygen(self, mol: dm.Mol):
        """8. number of oxygen"""
        sm = dm.from_smarts("[#8]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_oxygen_in_ring(self, mol: dm.Mol):
        """9. number of oxygen in rings"""
        sm = dm.from_smarts("[#8]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_sulfur(self, mol: dm.Mol):
        """10. number of sulfur atoms"""
        sm = dm.from_smarts("[#16]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_heteroatoms(self, mol: dm.Mol):
        """11. number of heteroatoms"""

        sm = dm.from_smarts("[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_heteroatoms_in_ring(self, mol: dm.Mol):
        """12. number of heteroatoms in rings"""
        sm = dm.from_smarts("[!#1&!#6&r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_spiro_atoms(self, mol: dm.Mol):
        """13. number of spiro atoms"""
        return Desc.CalcNumSpiroAtoms(mol)

    def n_heteroatom_more_than_2_conn(self, mol: dm.Mol):
        """14. number of heteroatoms with more than 2 connections"""
        sm = dm.from_smarts("[!#1;!#6;!D1!D0;!D2]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_carbon_atleast_2_heteroatoms(self, mol: dm.Mol):
        """15. number of carbon atoms connected to at least 2 heteroatoms"""
        n_atoms = 0
        for atom in mol.GetAtoms():
            tmp = [x for x in atom.GetNeighbors() if x.GetAtomicNum() not in [1, 6]]
            n_atoms += len(tmp) >= 2
        return n_atoms

    def n_atom_at_least_2_nei_more_than_2_conn(self, mol: dm.Mol):
        """16. Number of atoms where at least 2 connected atoms have more than 2 connections"""
        n_atoms = 0
        for atom in mol.GetAtoms():
            tmp = [x for x in atom.GetNeighbors() if len(x.GetNeighbors()) > 2]
            n_atoms += len(tmp) > 2
        return n_atoms

    def abs_scaffold_format_charge(self, mol: dm.Mol):
        """17. absolute value of the scaffold formal charge"""
        charge = GetFormalCharge(mol)
        return abs(charge)

    def n_bonds(self, mol: dm.Mol):
        """18. number of bonds"""
        return mol.GetNumBonds()

    def n_multiple_non_conj_ring_bonds(self, mol: dm.Mol):
        """19. number of multiple, nonconjugated ring bonds"""
        extracted_rings = []
        nr_multiple_bonds_infcr = 0  # infcr: in not fully conjugated ring
        rings = Chem.GetSymmSSSR(mol)
        for i in range(len(rings)):
            extracted_rings.append(list(rings[i]))
        for ring in extracted_rings:
            if not _is_ring_fully_conjugated(mol, ring):
                nr_multiple_bonds_infcr += _n_multiple_bond_in_ring(mol, ring)
        return nr_multiple_bonds_infcr

    def n_bonds_2_heteroatoms(self, mol: dm.Mol):
        """20. number of bonds connecting 2 heteroatoms"""
        sm = dm.from_smarts("[!#1&!#6]~[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_carbon_het_carbon_het_bonds(self, mol: dm.Mol):
        """21. number of bonds connecting 2 heteroatoms through 2 carbons"""
        sm = dm.from_smarts("[!#1&!#6]~[#6]~[#6]~[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_bonds_at_least_3_conn(self, mol: dm.Mol):
        """22. number of bonds with at least 3 connections on both its atoms"""
        sm = dm.from_smarts("[$([!#1](~[!#1])(~[!#1])~[!#1])][$([!#1](~[!#1])(~[!#1])~[!#1])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_exocyclic_single_bonds_carbon(self, mol: dm.Mol):
        """23. number of exocyclic single bonds where a ring atom is carbon"""
        sm = dm.from_smarts("[!R;!#1]-[#6;R]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_exocyclic_single_bonds_nitrogen(self, mol: dm.Mol):
        """24. number of exocyclic single bonds where a ring atom is nitrogen"""
        sm = dm.from_smarts("[!R;!#1]-[#7;R]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_non_ring_bonds_2_conj_rings(self, mol: dm.Mol):
        """25. number of non-ring bonds connecting 2 nonconjugated rings"""
        # EN: this is interpretated literally as bonds and not path
        ring_atom_conj_state = _ring_atom_state(mol)
        sm = dm.from_smarts("[R:1]!@[R:2]")
        bond_list = mol.GetSubstructMatches(sm, uniquify=True)
        result = 0
        for a_start, a_end in bond_list:
            s_state = ring_atom_conj_state.get(a_start)
            e_state = ring_atom_conj_state.get(a_end)
            if False in s_state and False in e_state:
                result += 1
        return result

    def n_non_ring_bonds_conj_nonconj_rings(self, mol: dm.Mol):
        """
        26. number of non-ring bonds connecting 2 rings,
        one of them conjugated and one non-conjugated
        """
        # EN: this is interpretated literally as bonds and not path

        ring_atom_conj_state = _ring_atom_state(mol)
        sm = dm.from_smarts("[R:1]!@[R:2]")
        bond_list = mol.GetSubstructMatches(sm, uniquify=True)
        result = 0
        for a_start, a_end in bond_list:
            s_state = ring_atom_conj_state.get(a_start)
            e_state = ring_atom_conj_state.get(a_end)
            if (True in s_state and False in e_state) or (False in s_state and True in e_state):
                result += 1
        return result

    def n_bonds_atoms_with_at_least_one_nei_with_2_conn(self, mol: dm.Mol):
        """
        27. number of bonds where both atoms have at least one neighbor
        (not considering the bond atoms) with more than 2 connections
        """
        result = 0
        huge_conn = list(
            itertools.chain(*mol.GetSubstructMatches(dm.from_smarts("[*;!D0;!D1;!D2]"), uniquify=1))
        )
        for bond in mol.GetBonds():
            a_start, a_end = bond.GetBeginAtom(), bond.GetEndAtom()
            # we need to exclud the bond atom themselves
            allowed_conn_table = [
                x for x in huge_conn if x not in [a_start.GetIdx(), a_end.GetIdx()]
            ]
            if any([x.GetIdx() in allowed_conn_table for x in a_start.GetNeighbors()]) and any(
                [y.GetIdx() in allowed_conn_table for y in a_end.GetNeighbors()]
            ):
                result += 1
        return result

    def n_simple_rings(self, mol: dm.Mol):
        """28. number of simple rings"""
        ri = mol.GetRingInfo()
        return ri.NumRings()

    def size_largest_ring(self, mol: dm.Mol):
        """29. Size of the largest ring"""
        ri = mol.GetRingInfo()
        max_ring_size = max((len(r) for r in ri.AtomRings()), default=0)
        return max_ring_size

    def n_simple_rings_no_heteroatoms(self, mol: dm.Mol):
        """30. number of simple rings with no heteroatoms"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 0)

    def n_simple_rings_1_heteroatoms(self, mol: dm.Mol):
        """31. number of simple rings with 1 heteroatom"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 1)

    def n_simple_rings_2_heteroatoms(self, mol: dm.Mol):
        """32. number of simple rings with 2 heteroatom"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 2)

    def n_simple_rings_at_least_3_heteroatoms(self, mol: dm.Mol):
        """33. number of simple rings with 3 or more heteroatoms"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x >= 3)

    def n_simple_non_conj_5_atoms_rings(self, mol: dm.Mol):
        """34. number of simple non-conjugated rings with 5 atoms"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 5:
                n += 1
        return n

    def n_simple_non_conj_6_atoms_rings(self, mol: dm.Mol):
        """35. number of simple non-conjugated rings with 6 atoms"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 6:
                n += 1
        return n

    def n_ring_system(self, mol: dm.Mol):
        """36. number of ring systems"""
        simple_rings, ring_system, _ = _get_ring_system(mol)
        return len(ring_system)

    def n_ring_system_with_2_non_conj_simple_ring(self, mol: dm.Mol):
        """37. number of rings systems with 2 non-conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
            result += n_not_conj == 2
        return result

    def n_ring_system_with_2_conj_simple_ring(self, mol: dm.Mol):
        """38. number of rings systems with 2 conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj == 2
        return result

    def n_ring_system_with_conj_non_conj_simple_ring(self, mol: dm.Mol):
        """39 number of ring system containing 2 simple rings, one conjugated and one nonconjugated"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            if len(ring_set) == 2:
                n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
                result += n_conj == 1
        return result

    def n_ring_system_with_3_conj_simple_ring(self, mol: dm.Mol):
        """40. number of rings systems with 3 conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj == 3
        return result

    def n_ring_system_with_3_non_conj_simple_ring(self, mol: dm.Mol):
        """41. number of rings systems with 3 non-conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
            result += n_not_conj == 3
        return result

    def n_ring_system_with_greater_one_conj_nonconj_simple_ring(self, mol: dm.Mol):
        """42. number of ring system containing 3 simple rings, at least one conjugated and one nonconjugated"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            if len(ring_set) == 3:
                n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
                result += n_conj in [1, 2]
        return result

    @property
    def columns(self):
        """Get the name of all the descriptors of this calculator"""
        return list(self.DESCRIPTORS)

    def __call__(self, mol: Union[dm.Mol, str]):
        r"""
        Compute the Fingerprint of a molecule

        Args:
            mol: the molecule of interest

        Returns:
            props (np.ndarray): list of computed rdkit molecular descriptors
        """
        mol = dm.to_mol(mol)
        if self.use_scaffold and mol is not None:
            mol = MurckoScaffold.GetScaffoldForMol(mol)

        props = []
        for k in self.DESCRIPTORS:
            try:
                fn = getattr(self, k)
                props.append(fn(mol))
            except Exception as e:
                if self.verbose:
                    logger.error(e)
                props.append(float("nan"))
        props = np.asarray(props)
        if self.normalize:
            return self.compute_normalization(props)
        return props

columns property

Get the name of all the descriptors of this calculator

__call__(mol)

Compute the Fingerprint of a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required

Returns:

Name Type Description
props ndarray

list of computed rdkit molecular descriptors

Source code in molfeat/calc/skeys.py
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
def __call__(self, mol: Union[dm.Mol, str]):
    r"""
    Compute the Fingerprint of a molecule

    Args:
        mol: the molecule of interest

    Returns:
        props (np.ndarray): list of computed rdkit molecular descriptors
    """
    mol = dm.to_mol(mol)
    if self.use_scaffold and mol is not None:
        mol = MurckoScaffold.GetScaffoldForMol(mol)

    props = []
    for k in self.DESCRIPTORS:
        try:
            fn = getattr(self, k)
            props.append(fn(mol))
        except Exception as e:
            if self.verbose:
                logger.error(e)
            props.append(float("nan"))
    props = np.asarray(props)
    if self.normalize:
        return self.compute_normalization(props)
    return props

__getstate__()

Get state of the scaffold key function

Source code in molfeat/calc/skeys.py
175
176
177
178
179
180
181
def __getstate__(self):
    """Get state of the scaffold key function"""
    state = {}
    state["normalize"] = self.normalize
    state["verbose"] = self.verbose
    state["use_scaffold"] = self.use_scaffold
    return state

__init__(normalize=False, verbose=False, use_scaffold=False, **kwargs)

Init of the scaffold key function

Parameters:

Name Type Description Default
normalize bool

whether to normalize the value of the feature

False
verbose bool

whether to log errors

False
use_scaffold bool

whether to convert the molecule into scaffold first

False
Source code in molfeat/calc/skeys.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def __init__(
    self, normalize: bool = False, verbose: bool = False, use_scaffold: bool = False, **kwargs
):
    """
    Init of the scaffold key function

    Args:
        normalize: whether to normalize the value of the feature
        verbose: whether to log errors
        use_scaffold: whether to convert the molecule into scaffold first
    """
    self.normalize = normalize
    self.verbose = verbose
    self.use_scaffold = use_scaffold

abs_scaffold_format_charge(mol)

  1. absolute value of the scaffold formal charge
Source code in molfeat/calc/skeys.py
291
292
293
294
def abs_scaffold_format_charge(self, mol: dm.Mol):
    """17. absolute value of the scaffold formal charge"""
    charge = GetFormalCharge(mol)
    return abs(charge)

compute_normalization(features) classmethod

Normalize input features. The normalization parameters are computed by the scaffolds of 2.1M molecules from CHEMBL 29.

Source code in molfeat/calc/skeys.py
186
187
188
189
190
191
@classmethod
def compute_normalization(cls, features: np.ndarray):
    """Normalize input features. The normalization parameters are
    computed by the scaffolds of 2.1M molecules from CHEMBL 29.
    """
    return (features - cls.NORM_PARAMS["mean"]) / cls.NORM_PARAMS["std"]

n_atom_at_least_2_nei_more_than_2_conn(mol)

  1. Number of atoms where at least 2 connected atoms have more than 2 connections
Source code in molfeat/calc/skeys.py
283
284
285
286
287
288
289
def n_atom_at_least_2_nei_more_than_2_conn(self, mol: dm.Mol):
    """16. Number of atoms where at least 2 connected atoms have more than 2 connections"""
    n_atoms = 0
    for atom in mol.GetAtoms():
        tmp = [x for x in atom.GetNeighbors() if len(x.GetNeighbors()) > 2]
        n_atoms += len(tmp) > 2
    return n_atoms

n_atom_exocyclic(mol)

  1. number of exocyclic atoms (connected by multiple bonds to a ring)
Source code in molfeat/calc/skeys.py
225
226
227
228
def n_atom_exocyclic(self, mol: dm.Mol):
    """5. number of exocyclic atoms (connected by multiple bonds to a ring)"""
    sm = dm.from_smarts("[!r;!$(*-[r])&$(*~[r])]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_atom_in_chain(mol)

  1. number atoms in chains (not counting double-connected exo-chain atoms)
Source code in molfeat/calc/skeys.py
220
221
222
223
def n_atom_in_chain(self, mol: dm.Mol):
    """4. number atoms in chains (not counting double-connected exo-chain atoms)"""
    sm = dm.from_smarts("[!r;!$(*=[r])]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_atom_in_conjugated_ring(mol)

  1. number of atoms in conjugated rings
Source code in molfeat/calc/skeys.py
198
199
200
201
202
203
204
205
def n_atom_in_conjugated_ring(self, mol: dm.Mol):
    """2. number of atoms in conjugated rings"""
    ri = mol.GetRingInfo()
    n = 0
    for ring in ri.AtomRings():
        if _is_ring_fully_conjugated(mol, ring):
            n += len(ring)
    return n

n_atom_in_rings(mol)

  1. number of ring atoms
Source code in molfeat/calc/skeys.py
193
194
195
196
def n_atom_in_rings(self, mol: dm.Mol):
    """1. number of ring atoms"""
    sm = dm.from_smarts("[r]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_atom_spiro_atoms(mol)

  1. number of spiro atoms
Source code in molfeat/calc/skeys.py
266
267
268
def n_atom_spiro_atoms(self, mol: dm.Mol):
    """13. number of spiro atoms"""
    return Desc.CalcNumSpiroAtoms(mol)

n_atoms_not_in_conjugated_ring(mol)

  1. number of atoms not in conjugated rings (i.e. atoms in aliphatic rings and non-ring atoms)
Source code in molfeat/calc/skeys.py
207
208
209
210
211
212
213
214
215
216
217
218
def n_atoms_not_in_conjugated_ring(self, mol: dm.Mol):
    """
    3. number of atoms not in conjugated rings
    (i.e. atoms in aliphatic rings and non-ring atoms)
    """
    # EN: replace conjugation by aromatic
    ri = mol.GetRingInfo()
    n = 0
    for ring in ri.AtomRings():
        if not _is_ring_fully_conjugated(mol, ring):
            n += len(ring)
    return n

n_bonds(mol)

  1. number of bonds
Source code in molfeat/calc/skeys.py
296
297
298
def n_bonds(self, mol: dm.Mol):
    """18. number of bonds"""
    return mol.GetNumBonds()

n_bonds_2_heteroatoms(mol)

  1. number of bonds connecting 2 heteroatoms
Source code in molfeat/calc/skeys.py
312
313
314
315
def n_bonds_2_heteroatoms(self, mol: dm.Mol):
    """20. number of bonds connecting 2 heteroatoms"""
    sm = dm.from_smarts("[!#1&!#6]~[!#1&!#6]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_bonds_at_least_3_conn(mol)

  1. number of bonds with at least 3 connections on both its atoms
Source code in molfeat/calc/skeys.py
322
323
324
325
def n_bonds_at_least_3_conn(self, mol: dm.Mol):
    """22. number of bonds with at least 3 connections on both its atoms"""
    sm = dm.from_smarts("[$([!#1](~[!#1])(~[!#1])~[!#1])][$([!#1](~[!#1])(~[!#1])~[!#1])]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_bonds_atoms_with_at_least_one_nei_with_2_conn(mol)

  1. number of bonds where both atoms have at least one neighbor (not considering the bond atoms) with more than 2 connections
Source code in molfeat/calc/skeys.py
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
def n_bonds_atoms_with_at_least_one_nei_with_2_conn(self, mol: dm.Mol):
    """
    27. number of bonds where both atoms have at least one neighbor
    (not considering the bond atoms) with more than 2 connections
    """
    result = 0
    huge_conn = list(
        itertools.chain(*mol.GetSubstructMatches(dm.from_smarts("[*;!D0;!D1;!D2]"), uniquify=1))
    )
    for bond in mol.GetBonds():
        a_start, a_end = bond.GetBeginAtom(), bond.GetEndAtom()
        # we need to exclud the bond atom themselves
        allowed_conn_table = [
            x for x in huge_conn if x not in [a_start.GetIdx(), a_end.GetIdx()]
        ]
        if any([x.GetIdx() in allowed_conn_table for x in a_start.GetNeighbors()]) and any(
            [y.GetIdx() in allowed_conn_table for y in a_end.GetNeighbors()]
        ):
            result += 1
    return result

n_carbon_atleast_2_heteroatoms(mol)

  1. number of carbon atoms connected to at least 2 heteroatoms
Source code in molfeat/calc/skeys.py
275
276
277
278
279
280
281
def n_carbon_atleast_2_heteroatoms(self, mol: dm.Mol):
    """15. number of carbon atoms connected to at least 2 heteroatoms"""
    n_atoms = 0
    for atom in mol.GetAtoms():
        tmp = [x for x in atom.GetNeighbors() if x.GetAtomicNum() not in [1, 6]]
        n_atoms += len(tmp) >= 2
    return n_atoms

n_carbon_het_carbon_het_bonds(mol)

  1. number of bonds connecting 2 heteroatoms through 2 carbons
Source code in molfeat/calc/skeys.py
317
318
319
320
def n_carbon_het_carbon_het_bonds(self, mol: dm.Mol):
    """21. number of bonds connecting 2 heteroatoms through 2 carbons"""
    sm = dm.from_smarts("[!#1&!#6]~[#6]~[#6]~[!#1&!#6]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_exocyclic_single_bonds_carbon(mol)

  1. number of exocyclic single bonds where a ring atom is carbon
Source code in molfeat/calc/skeys.py
327
328
329
330
def n_exocyclic_single_bonds_carbon(self, mol: dm.Mol):
    """23. number of exocyclic single bonds where a ring atom is carbon"""
    sm = dm.from_smarts("[!R;!#1]-[#6;R]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_exocyclic_single_bonds_nitrogen(mol)

  1. number of exocyclic single bonds where a ring atom is nitrogen
Source code in molfeat/calc/skeys.py
332
333
334
335
def n_exocyclic_single_bonds_nitrogen(self, mol: dm.Mol):
    """24. number of exocyclic single bonds where a ring atom is nitrogen"""
    sm = dm.from_smarts("[!R;!#1]-[#7;R]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_heteroatom_more_than_2_conn(mol)

  1. number of heteroatoms with more than 2 connections
Source code in molfeat/calc/skeys.py
270
271
272
273
def n_heteroatom_more_than_2_conn(self, mol: dm.Mol):
    """14. number of heteroatoms with more than 2 connections"""
    sm = dm.from_smarts("[!#1;!#6;!D1!D0;!D2]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_heteroatoms(mol)

  1. number of heteroatoms
Source code in molfeat/calc/skeys.py
255
256
257
258
259
def n_heteroatoms(self, mol: dm.Mol):
    """11. number of heteroatoms"""

    sm = dm.from_smarts("[!#1&!#6]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_heteroatoms_in_ring(mol)

  1. number of heteroatoms in rings
Source code in molfeat/calc/skeys.py
261
262
263
264
def n_heteroatoms_in_ring(self, mol: dm.Mol):
    """12. number of heteroatoms in rings"""
    sm = dm.from_smarts("[!#1&!#6&r]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_multiple_non_conj_ring_bonds(mol)

  1. number of multiple, nonconjugated ring bonds
Source code in molfeat/calc/skeys.py
300
301
302
303
304
305
306
307
308
309
310
def n_multiple_non_conj_ring_bonds(self, mol: dm.Mol):
    """19. number of multiple, nonconjugated ring bonds"""
    extracted_rings = []
    nr_multiple_bonds_infcr = 0  # infcr: in not fully conjugated ring
    rings = Chem.GetSymmSSSR(mol)
    for i in range(len(rings)):
        extracted_rings.append(list(rings[i]))
    for ring in extracted_rings:
        if not _is_ring_fully_conjugated(mol, ring):
            nr_multiple_bonds_infcr += _n_multiple_bond_in_ring(mol, ring)
    return nr_multiple_bonds_infcr

n_nitrogen(mol)

  1. number of nitrogen
Source code in molfeat/calc/skeys.py
230
231
232
233
def n_nitrogen(self, mol: dm.Mol):
    """6. number of nitrogen"""
    sm = dm.from_smarts("[#7]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_nitrogen_in_ring(mol)

  1. number of nitrogen in rings
Source code in molfeat/calc/skeys.py
235
236
237
238
def n_nitrogen_in_ring(self, mol: dm.Mol):
    """7. number of nitrogen in rings"""
    sm = dm.from_smarts("[#7;r]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_non_ring_bonds_2_conj_rings(mol)

  1. number of non-ring bonds connecting 2 nonconjugated rings
Source code in molfeat/calc/skeys.py
337
338
339
340
341
342
343
344
345
346
347
348
349
def n_non_ring_bonds_2_conj_rings(self, mol: dm.Mol):
    """25. number of non-ring bonds connecting 2 nonconjugated rings"""
    # EN: this is interpretated literally as bonds and not path
    ring_atom_conj_state = _ring_atom_state(mol)
    sm = dm.from_smarts("[R:1]!@[R:2]")
    bond_list = mol.GetSubstructMatches(sm, uniquify=True)
    result = 0
    for a_start, a_end in bond_list:
        s_state = ring_atom_conj_state.get(a_start)
        e_state = ring_atom_conj_state.get(a_end)
        if False in s_state and False in e_state:
            result += 1
    return result

n_non_ring_bonds_conj_nonconj_rings(mol)

  1. number of non-ring bonds connecting 2 rings, one of them conjugated and one non-conjugated
Source code in molfeat/calc/skeys.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
def n_non_ring_bonds_conj_nonconj_rings(self, mol: dm.Mol):
    """
    26. number of non-ring bonds connecting 2 rings,
    one of them conjugated and one non-conjugated
    """
    # EN: this is interpretated literally as bonds and not path

    ring_atom_conj_state = _ring_atom_state(mol)
    sm = dm.from_smarts("[R:1]!@[R:2]")
    bond_list = mol.GetSubstructMatches(sm, uniquify=True)
    result = 0
    for a_start, a_end in bond_list:
        s_state = ring_atom_conj_state.get(a_start)
        e_state = ring_atom_conj_state.get(a_end)
        if (True in s_state and False in e_state) or (False in s_state and True in e_state):
            result += 1
    return result

n_oxygen(mol)

  1. number of oxygen
Source code in molfeat/calc/skeys.py
240
241
242
243
def n_oxygen(self, mol: dm.Mol):
    """8. number of oxygen"""
    sm = dm.from_smarts("[#8]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_oxygen_in_ring(mol)

  1. number of oxygen in rings
Source code in molfeat/calc/skeys.py
245
246
247
248
def n_oxygen_in_ring(self, mol: dm.Mol):
    """9. number of oxygen in rings"""
    sm = dm.from_smarts("[#8]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

n_ring_system(mol)

  1. number of ring systems
Source code in molfeat/calc/skeys.py
443
444
445
446
def n_ring_system(self, mol: dm.Mol):
    """36. number of ring systems"""
    simple_rings, ring_system, _ = _get_ring_system(mol)
    return len(ring_system)

n_ring_system_with_2_conj_simple_ring(mol)

  1. number of rings systems with 2 conjugated simple rings
Source code in molfeat/calc/skeys.py
460
461
462
463
464
465
466
467
468
469
470
def n_ring_system_with_2_conj_simple_ring(self, mol: dm.Mol):
    """38. number of rings systems with 2 conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
        result += n_conj == 2
    return result

n_ring_system_with_2_non_conj_simple_ring(mol)

  1. number of rings systems with 2 non-conjugated simple rings
Source code in molfeat/calc/skeys.py
448
449
450
451
452
453
454
455
456
457
458
def n_ring_system_with_2_non_conj_simple_ring(self, mol: dm.Mol):
    """37. number of rings systems with 2 non-conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
        result += n_not_conj == 2
    return result

n_ring_system_with_3_conj_simple_ring(mol)

  1. number of rings systems with 3 conjugated simple rings
Source code in molfeat/calc/skeys.py
485
486
487
488
489
490
491
492
493
494
495
def n_ring_system_with_3_conj_simple_ring(self, mol: dm.Mol):
    """40. number of rings systems with 3 conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
        result += n_conj == 3
    return result

n_ring_system_with_3_non_conj_simple_ring(mol)

  1. number of rings systems with 3 non-conjugated simple rings
Source code in molfeat/calc/skeys.py
497
498
499
500
501
502
503
504
505
506
507
def n_ring_system_with_3_non_conj_simple_ring(self, mol: dm.Mol):
    """41. number of rings systems with 3 non-conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
        result += n_not_conj == 3
    return result

n_ring_system_with_conj_non_conj_simple_ring(mol)

39 number of ring system containing 2 simple rings, one conjugated and one nonconjugated

Source code in molfeat/calc/skeys.py
472
473
474
475
476
477
478
479
480
481
482
483
def n_ring_system_with_conj_non_conj_simple_ring(self, mol: dm.Mol):
    """39 number of ring system containing 2 simple rings, one conjugated and one nonconjugated"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        if len(ring_set) == 2:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj == 1
    return result

n_ring_system_with_greater_one_conj_nonconj_simple_ring(mol)

  1. number of ring system containing 3 simple rings, at least one conjugated and one nonconjugated
Source code in molfeat/calc/skeys.py
509
510
511
512
513
514
515
516
517
518
519
520
def n_ring_system_with_greater_one_conj_nonconj_simple_ring(self, mol: dm.Mol):
    """42. number of ring system containing 3 simple rings, at least one conjugated and one nonconjugated"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    conj_rings_map = dict(
        (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
    )
    result = 0
    for ring_set in ring_map:
        if len(ring_set) == 3:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj in [1, 2]
    return result

n_simple_non_conj_5_atoms_rings(mol)

  1. number of simple non-conjugated rings with 5 atoms
Source code in molfeat/calc/skeys.py
425
426
427
428
429
430
431
432
def n_simple_non_conj_5_atoms_rings(self, mol: dm.Mol):
    """34. number of simple non-conjugated rings with 5 atoms"""
    ri = mol.GetRingInfo()
    n = 0
    for ring in ri.AtomRings():
        if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 5:
            n += 1
    return n

n_simple_non_conj_6_atoms_rings(mol)

  1. number of simple non-conjugated rings with 6 atoms
Source code in molfeat/calc/skeys.py
434
435
436
437
438
439
440
441
def n_simple_non_conj_6_atoms_rings(self, mol: dm.Mol):
    """35. number of simple non-conjugated rings with 6 atoms"""
    ri = mol.GetRingInfo()
    n = 0
    for ring in ri.AtomRings():
        if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 6:
            n += 1
    return n

n_simple_rings(mol)

  1. number of simple rings
Source code in molfeat/calc/skeys.py
390
391
392
393
def n_simple_rings(self, mol: dm.Mol):
    """28. number of simple rings"""
    ri = mol.GetRingInfo()
    return ri.NumRings()

n_simple_rings_1_heteroatoms(mol)

  1. number of simple rings with 1 heteroatom
Source code in molfeat/calc/skeys.py
407
408
409
410
411
def n_simple_rings_1_heteroatoms(self, mol: dm.Mol):
    """31. number of simple rings with 1 heteroatom"""
    ri = mol.GetRingInfo()
    n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
    return sum(1 for x in n_heteros if x == 1)

n_simple_rings_2_heteroatoms(mol)

  1. number of simple rings with 2 heteroatom
Source code in molfeat/calc/skeys.py
413
414
415
416
417
def n_simple_rings_2_heteroatoms(self, mol: dm.Mol):
    """32. number of simple rings with 2 heteroatom"""
    ri = mol.GetRingInfo()
    n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
    return sum(1 for x in n_heteros if x == 2)

n_simple_rings_at_least_3_heteroatoms(mol)

  1. number of simple rings with 3 or more heteroatoms
Source code in molfeat/calc/skeys.py
419
420
421
422
423
def n_simple_rings_at_least_3_heteroatoms(self, mol: dm.Mol):
    """33. number of simple rings with 3 or more heteroatoms"""
    ri = mol.GetRingInfo()
    n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
    return sum(1 for x in n_heteros if x >= 3)

n_simple_rings_no_heteroatoms(mol)

  1. number of simple rings with no heteroatoms
Source code in molfeat/calc/skeys.py
401
402
403
404
405
def n_simple_rings_no_heteroatoms(self, mol: dm.Mol):
    """30. number of simple rings with no heteroatoms"""
    ri = mol.GetRingInfo()
    n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
    return sum(1 for x in n_heteros if x == 0)

n_sulfur(mol)

  1. number of sulfur atoms
Source code in molfeat/calc/skeys.py
250
251
252
253
def n_sulfur(self, mol: dm.Mol):
    """10. number of sulfur atoms"""
    sm = dm.from_smarts("[#16]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

size_largest_ring(mol)

  1. Size of the largest ring
Source code in molfeat/calc/skeys.py
395
396
397
398
399
def size_largest_ring(self, mol: dm.Mol):
    """29. Size of the largest ring"""
    ri = mol.GetRingInfo()
    max_ring_size = max((len(r) for r in ri.AtomRings()), default=0)
    return max_ring_size

skdistance(sk1, sk2, weights=None, cdist=False)

Compute the scaffold distance between two scaffold keys as described in https://pubs.acs.org/doi/abs/10.1021/ci5001983. The input features are expected to be normalized beforehand (see paper)

Parameters:

Name Type Description Default
sk1 ndarray

scaffold key 1

required
sk2 ndarray

scaffold key 2

required
weights Optional[ndarray]

how to weight each of the features. By default rank ordering is used.

None
cdist bool

whether to compute the features on a batched of inputs (expected 2D)

False

Returns:

Name Type Description
dist float

distance between two scaffold keys

Source code in molfeat/calc/skeys.py
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
def skdistance(
    sk1: np.ndarray,
    sk2: np.ndarray,
    weights: Optional[np.ndarray] = None,
    cdist: bool = False,
):
    """Compute the scaffold distance between two scaffold keys
    as described in https://pubs.acs.org/doi/abs/10.1021/ci5001983.
    The input features are expected to be normalized beforehand (see paper)

    Args:
        sk1: scaffold key 1
        sk2: scaffold key 2
        weights: how to weight each of the features. By default rank ordering is used.
        cdist: whether to compute the features on a batched of inputs (expected 2D)

    Returns:
        dist (float): distance between two scaffold keys
    """
    if weights is None:
        weights = 1 / (np.arange(sk1.shape[-1]) + 1)

    if cdist:
        sk1 = np.atleast_2d(sk1)
        sk2 = np.atleast_2d(sk2)
        val = np.abs(sk1[:, None] - sk2[:]) ** 1.5
        dist = np.sum(val * weights, axis=-1)
    else:
        if any((sk.ndim > 1 and sk.shape[0] != 1) for sk in [sk1, sk2]):
            raise ValueError("`cdist` mode was not detected, you need to provide single vectors")
        val = np.abs(sk1 - sk2) ** 1.5
        dist = np.sum(val * weights)
    return dist

Shape

ElectroShapeDescriptors

Bases: SerializableCalculator

Compute Electroshape descriptors as described by

Armstrong et al. ElectroShape: fast molecular similarity calculations incorporating shape, chirality and electrostatics. J Comput Aided Mol Des 24, 789-801 (2010). http://dx.doi.org/doi:10.1007/s10822-010-9374-0

Source code in molfeat/calc/shape.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
class ElectroShapeDescriptors(SerializableCalculator):
    """Compute Electroshape descriptors as described by

    Armstrong et al. ElectroShape: fast molecular similarity calculations incorporating shape, chirality and electrostatics.
    J Comput Aided Mol Des 24, 789-801 (2010). http://dx.doi.org/doi:10.1007/s10822-010-9374-0
    """

    SUPPORTED_CHARGE_MODELS = ["gasteiger", "tripos", "mmff94", "formal"]

    def __init__(
        self,
        charge_model: str = "gasteiger",
        replace_nan: bool = False,
        electron_scaling: float = 25.0,
        **kwargs,
    ):
        """Constructor for ElectroShape descriptor

        Args:
            charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
                Note that formal charges will be computed on the fly if not provided in the input molecules.
                The `tripos` charge models comes from TRIPOS force field and are often parsed from mol2 files.
            replace_nan: whether to replace NaN values. Defaults False
            electron_scaling: scaling factor to convert electron charges to Angstroms. Defaults to 25.0.
        """

        self.charge_model = charge_model
        self.replace_nan = replace_nan
        self.electron_scaling = electron_scaling
        self._columns = None

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        """
        if self._columns is None:
            self._columns = []
            for i in range(1, 6):
                self._columns.extend([f"dist-{i}-mean", f"dist-{i}-std", f"dist-{i}-crb"])

        return self._columns

    def __getstate__(self):
        state = {}
        state["charge_model"] = self.charge_model
        state["replace_nan"] = self.replace_nan
        state["electron_scaling"] = self.electron_scaling
        state["_columns"] = self._columns
        return state

    def __len__(self):
        """Return the length of the calculator"""
        return len(self.columns)

    @staticmethod
    def compute_charge(mol: Union[dm.Mol, str], charge_model: str = None):
        """
        Get the molecular charge of the molecule.

        Args:
            charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
        """

        if charge_model not in ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS:
            raise ValueError(
                f"Unknown charge model {charge_model}. You should provide one of {ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS}"
            )
        mol = dm.to_mol(mol)
        atom_charge = []
        atom_list = list(mol.GetAtoms())

        # force compute the partial charges if not provided
        if charge_model == "gasteiger" and not atom_list[0].HasProp("_GasteigerCharge"):
            rdPartialCharges.ComputeGasteigerCharges(mol)
        elif charge_model == "mmff94" and not atom_list[0].HasProp("_MMFF94Charge"):
            ff_infos = rdForceFieldHelpers.MMFFGetMoleculeProperties(mol)
            for i, atom in enumerate(atom_list):
                atom.SetDoubleProp("_MMFF94Charge", ff_infos.GetMMFFPartialCharge(i))

        for atom in mol.GetAtoms():
            if charge_model == "formal":
                atom_charge.append(atom.GetFormalCharge())
            elif charge_model == "gasteiger":
                atom_charge.append(atom.GetDoubleProp("_GasteigerCharge"))
            elif charge_model == "mmff94":
                atom_charge.append(atom.GetDoubleProp("_MMFF94Charge"))
            elif charge_model == "tripos":
                atom_charge.append(atom.GetDoubleProp("_TriposPartialCharge"))
        return np.asarray(atom_charge)

    @requires_conformer
    def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
        r"""
        Get rdkit 3D descriptors for a molecule

        Args:
            mol: the molecule of interest
            conformer_id (int, optional): Optional conformer id. Defaults to -1.

        Returns:
            shape_descriptor (np.ndarray): computed shape descriptor
        """

        mol = dm.to_mol(mol)
        coords = mol.GetConformer(conformer_id).GetPositions()
        charge = self.compute_charge(mol, self.charge_model)
        if self.replace_nan:
            charge = np.nan_to_num(charge)

        desc_4d = np.column_stack((coords, charge * self.electron_scaling))

        c1 = desc_4d.mean(axis=0)
        distances_c1 = norm(desc_4d - c1, axis=1)

        c2 = desc_4d[distances_c1.argmax()]  # atom position furthest from c1
        distances_c2 = norm(desc_4d - c2, axis=1)

        c3 = desc_4d[distances_c2.argmax()]  # atom position furthest from c2
        distances_c3 = norm(desc_4d - c3, axis=1)

        vector_a = c2 - c1
        vector_b = c3 - c1
        vector_as = vector_a[:3]  # spatial parts of these vectors
        vector_bs = vector_b[:3]  # spatial parts of these vectors
        cross_ab = np.cross(vector_as, vector_bs)
        vector_c = (norm(vector_a) / (2 * norm(cross_ab))) * cross_ab
        vector_c1s = c1[:3]

        max_charge = np.array(np.amax(charge) * self.electron_scaling)
        min_charge = np.array(np.amin(charge) * self.electron_scaling)

        c4 = np.append(vector_c1s + vector_c, max_charge)
        c5 = np.append(vector_c1s + vector_c, min_charge)

        distances_c4 = norm(desc_4d - c4, axis=1)
        distances_c5 = norm(desc_4d - c5, axis=1)

        distances_list = [
            distances_c1,
            distances_c2,
            distances_c3,
            distances_c4,
            distances_c5,
        ]

        shape_descriptor = np.zeros(15)

        i = 0
        for distances in distances_list:
            mean = np.mean(distances)
            shape_descriptor[0 + i] = mean
            shape_descriptor[1 + i] = np.std(distances)
            shape_descriptor[2 + i] = cbrt(np.sum(((distances - mean) ** 3) / distances.size))
            i += 3
        if self.replace_nan:
            return np.nan_to_num(shape_descriptor)
        return shape_descriptor

columns property

Get the name of all the descriptors of this calculator

__call__(mol, conformer_id=-1)

Get rdkit 3D descriptors for a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
conformer_id int

Optional conformer id. Defaults to -1.

-1

Returns:

Name Type Description
shape_descriptor ndarray

computed shape descriptor

Source code in molfeat/calc/shape.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
@requires_conformer
def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
    r"""
    Get rdkit 3D descriptors for a molecule

    Args:
        mol: the molecule of interest
        conformer_id (int, optional): Optional conformer id. Defaults to -1.

    Returns:
        shape_descriptor (np.ndarray): computed shape descriptor
    """

    mol = dm.to_mol(mol)
    coords = mol.GetConformer(conformer_id).GetPositions()
    charge = self.compute_charge(mol, self.charge_model)
    if self.replace_nan:
        charge = np.nan_to_num(charge)

    desc_4d = np.column_stack((coords, charge * self.electron_scaling))

    c1 = desc_4d.mean(axis=0)
    distances_c1 = norm(desc_4d - c1, axis=1)

    c2 = desc_4d[distances_c1.argmax()]  # atom position furthest from c1
    distances_c2 = norm(desc_4d - c2, axis=1)

    c3 = desc_4d[distances_c2.argmax()]  # atom position furthest from c2
    distances_c3 = norm(desc_4d - c3, axis=1)

    vector_a = c2 - c1
    vector_b = c3 - c1
    vector_as = vector_a[:3]  # spatial parts of these vectors
    vector_bs = vector_b[:3]  # spatial parts of these vectors
    cross_ab = np.cross(vector_as, vector_bs)
    vector_c = (norm(vector_a) / (2 * norm(cross_ab))) * cross_ab
    vector_c1s = c1[:3]

    max_charge = np.array(np.amax(charge) * self.electron_scaling)
    min_charge = np.array(np.amin(charge) * self.electron_scaling)

    c4 = np.append(vector_c1s + vector_c, max_charge)
    c5 = np.append(vector_c1s + vector_c, min_charge)

    distances_c4 = norm(desc_4d - c4, axis=1)
    distances_c5 = norm(desc_4d - c5, axis=1)

    distances_list = [
        distances_c1,
        distances_c2,
        distances_c3,
        distances_c4,
        distances_c5,
    ]

    shape_descriptor = np.zeros(15)

    i = 0
    for distances in distances_list:
        mean = np.mean(distances)
        shape_descriptor[0 + i] = mean
        shape_descriptor[1 + i] = np.std(distances)
        shape_descriptor[2 + i] = cbrt(np.sum(((distances - mean) ** 3) / distances.size))
        i += 3
    if self.replace_nan:
        return np.nan_to_num(shape_descriptor)
    return shape_descriptor

__init__(charge_model='gasteiger', replace_nan=False, electron_scaling=25.0, **kwargs)

Constructor for ElectroShape descriptor

Parameters:

Name Type Description Default
charge_model str

charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger". Note that formal charges will be computed on the fly if not provided in the input molecules. The tripos charge models comes from TRIPOS force field and are often parsed from mol2 files.

'gasteiger'
replace_nan bool

whether to replace NaN values. Defaults False

False
electron_scaling float

scaling factor to convert electron charges to Angstroms. Defaults to 25.0.

25.0
Source code in molfeat/calc/shape.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def __init__(
    self,
    charge_model: str = "gasteiger",
    replace_nan: bool = False,
    electron_scaling: float = 25.0,
    **kwargs,
):
    """Constructor for ElectroShape descriptor

    Args:
        charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
            Note that formal charges will be computed on the fly if not provided in the input molecules.
            The `tripos` charge models comes from TRIPOS force field and are often parsed from mol2 files.
        replace_nan: whether to replace NaN values. Defaults False
        electron_scaling: scaling factor to convert electron charges to Angstroms. Defaults to 25.0.
    """

    self.charge_model = charge_model
    self.replace_nan = replace_nan
    self.electron_scaling = electron_scaling
    self._columns = None

__len__()

Return the length of the calculator

Source code in molfeat/calc/shape.py
130
131
132
def __len__(self):
    """Return the length of the calculator"""
    return len(self.columns)

compute_charge(mol, charge_model=None) staticmethod

Get the molecular charge of the molecule.

Parameters:

Name Type Description Default
charge_model str

charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".

None
Source code in molfeat/calc/shape.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
@staticmethod
def compute_charge(mol: Union[dm.Mol, str], charge_model: str = None):
    """
    Get the molecular charge of the molecule.

    Args:
        charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
    """

    if charge_model not in ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS:
        raise ValueError(
            f"Unknown charge model {charge_model}. You should provide one of {ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS}"
        )
    mol = dm.to_mol(mol)
    atom_charge = []
    atom_list = list(mol.GetAtoms())

    # force compute the partial charges if not provided
    if charge_model == "gasteiger" and not atom_list[0].HasProp("_GasteigerCharge"):
        rdPartialCharges.ComputeGasteigerCharges(mol)
    elif charge_model == "mmff94" and not atom_list[0].HasProp("_MMFF94Charge"):
        ff_infos = rdForceFieldHelpers.MMFFGetMoleculeProperties(mol)
        for i, atom in enumerate(atom_list):
            atom.SetDoubleProp("_MMFF94Charge", ff_infos.GetMMFFPartialCharge(i))

    for atom in mol.GetAtoms():
        if charge_model == "formal":
            atom_charge.append(atom.GetFormalCharge())
        elif charge_model == "gasteiger":
            atom_charge.append(atom.GetDoubleProp("_GasteigerCharge"))
        elif charge_model == "mmff94":
            atom_charge.append(atom.GetDoubleProp("_MMFF94Charge"))
        elif charge_model == "tripos":
            atom_charge.append(atom.GetDoubleProp("_TriposPartialCharge"))
    return np.asarray(atom_charge)

USRDescriptors

Bases: SerializableCalculator

Descriptors for the shape of a molecule.

!!! note: The following shape descriptors are offered: * USR: UltraFast Shape Recognition * USRCAT: Ultrafast Shape Recognition with CREDO Atom Types

Source code in molfeat/calc/shape.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class USRDescriptors(SerializableCalculator):
    """Descriptors for the shape of a molecule.

    !!! note:
        The following shape descriptors are offered:
            * USR: UltraFast Shape Recognition
            * USRCAT: Ultrafast Shape Recognition with CREDO Atom Types
    """

    def __init__(self, method: str = "USR", replace_nan: bool = False, **kwargs):
        """Constructor for ShapeDescriptors

        Args:
            method: Shape descriptor method to use. One of 'USR', 'USRCAT'. Default to 'USR'
            replace_nan: Whether to replace nan or infinite values. Defaults to False.
        """
        self.method = method.upper()
        if self.method not in ["USR", "USRCAT"]:
            raise ValueError(f"Shape descriptor {self.method} is not supported")
        self.replace_nan = replace_nan
        self._columns = None

    def __getstate__(self):
        state = {}
        state["method"] = self.method
        state["replace_nan"] = self.replace_nan
        state["_columns"] = self._columns
        return state

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        """
        if self._columns is None:
            if self.method == "USR":
                self._columns = [f"usr-{i}" for i in range(1, 13)]
            elif self.method == "USRCAT":
                self._columns = [f"usr-{i}" for i in range(1, 61)]
        return self._columns

    def __len__(self):
        """Compute descriptors length"""
        return len(self.columns)

    @requires_conformer
    def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1) -> np.ndarray:
        r"""
        Get rdkit 3D descriptors for a molecule

        Args:
            mol: the molecule of interest
            conformer_id: Optional conformer id. Defaults to -1.

        Returns:
            shape_descriptors: list of computed molecular descriptors
        """
        if self.method == "USR":
            shape_descr = rdMolDescriptors.GetUSR(mol, confId=conformer_id)
        elif self.method == "USRCAT":
            shape_descr = rdMolDescriptors.GetUSRCAT(mol, confId=conformer_id)
        if self.replace_nan:
            shape_descr = np.nan_to_num(shape_descr, self.replace_nan)
        return np.asarray(shape_descr)

columns property

Get the name of all the descriptors of this calculator

__call__(mol, conformer_id=-1)

Get rdkit 3D descriptors for a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
conformer_id Optional[int]

Optional conformer id. Defaults to -1.

-1

Returns:

Name Type Description
shape_descriptors ndarray

list of computed molecular descriptors

Source code in molfeat/calc/shape.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@requires_conformer
def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1) -> np.ndarray:
    r"""
    Get rdkit 3D descriptors for a molecule

    Args:
        mol: the molecule of interest
        conformer_id: Optional conformer id. Defaults to -1.

    Returns:
        shape_descriptors: list of computed molecular descriptors
    """
    if self.method == "USR":
        shape_descr = rdMolDescriptors.GetUSR(mol, confId=conformer_id)
    elif self.method == "USRCAT":
        shape_descr = rdMolDescriptors.GetUSRCAT(mol, confId=conformer_id)
    if self.replace_nan:
        shape_descr = np.nan_to_num(shape_descr, self.replace_nan)
    return np.asarray(shape_descr)

__init__(method='USR', replace_nan=False, **kwargs)

Constructor for ShapeDescriptors

Parameters:

Name Type Description Default
method str

Shape descriptor method to use. One of 'USR', 'USRCAT'. Default to 'USR'

'USR'
replace_nan bool

Whether to replace nan or infinite values. Defaults to False.

False
Source code in molfeat/calc/shape.py
22
23
24
25
26
27
28
29
30
31
32
33
def __init__(self, method: str = "USR", replace_nan: bool = False, **kwargs):
    """Constructor for ShapeDescriptors

    Args:
        method: Shape descriptor method to use. One of 'USR', 'USRCAT'. Default to 'USR'
        replace_nan: Whether to replace nan or infinite values. Defaults to False.
    """
    self.method = method.upper()
    if self.method not in ["USR", "USRCAT"]:
        raise ValueError(f"Shape descriptor {self.method} is not supported")
    self.replace_nan = replace_nan
    self._columns = None

__len__()

Compute descriptors length

Source code in molfeat/calc/shape.py
54
55
56
def __len__(self):
    """Compute descriptors length"""
    return len(self.columns)

usrdistance(shape_1, shape_2, weights=None)

Computes similarity between molecules

Parameters:

Name Type Description Default
shape_1

USR shape descriptor of first molecule

required
shape_2

USR shape descriptor

required
weights Optional[List[float]]

List of scaling factor to use for

None

Returns:

Name Type Description
dist

Distance [0-1] between shapes of molecules, 0 indicates identical molecules

Source code in molfeat/calc/shape.py
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def usrdistance(
    shape_1,
    shape_2,
    weights: Optional[List[float]] = None,
):
    """Computes similarity between molecules

    Args:
        shape_1: USR shape descriptor of first molecule
        shape_2: USR shape descriptor
        weights: List of scaling factor to use for

    Returns:
        dist: Distance [0-1] between shapes of molecules, 0 indicates identical molecules
    """

    # case for usr shape descriptors
    if weights is None:
        weights = []
    if (
        (shape_1.shape[-1] == shape_2.shape[-1] == 12)
        or (shape_1.shape[-1] == shape_2.shape[-1] == 60)
        or (shape_1.shape[-1] == shape_2.shape[-1] == 15)
    ):
        dist = rdMolDescriptors.GetUSRScore(shape_1, shape_2, weights=weights)
        return dist

    raise Exception(
        "Given vectors are not valid USR shape descriptors "
        "or come from different methods. Correct vector lengths"
        "are: 12 for USR, 60 for USRCAT, 15 for Electroshape"
    )

Atoms Featurizer

AtomCalculator

Bases: SerializableCalculator

Base class for computing atom properties compatible with DGLLife

Source code in molfeat/calc/atom.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
class AtomCalculator(SerializableCalculator):
    """
    Base class for computing atom properties compatible with DGLLife
    """

    DEFAULT_FEATURIZER = {
        "atom_one_hot": atom_one_hot,
        "atom_degree_one_hot": atom_degree_one_hot,
        "atom_implicit_valence_one_hot": atom_implicit_valence_one_hot,
        "atom_hybridization_one_hot": atom_hybridization_one_hot,
        "atom_is_aromatic": atom_is_aromatic,
        "atom_formal_charge": atom_formal_charge,
        "atom_num_radical_electrons": atom_num_radical_electrons,
        "atom_is_in_ring": atom_is_in_ring,
        "atom_total_num_H_one_hot": atom_total_num_H_one_hot,
        "atom_chiral_tag_one_hot": atom_chiral_tag_one_hot,
        "atom_is_chiral_center": atom_is_chiral_center,
    }

    def __init__(
        self,
        featurizer_funcs: Dict[str, Callable] = None,
        concat: bool = True,
        name: str = "hv",
    ):
        """
        Init function of the atom property calculator

        Args:
            featurizer_funcs : Mapping of feature name to the featurization function.
                For compatibility a list of callable/function is still accepted, and the corresponding
                featurizer name will be automatically generated. Each function is of signature
                ``func(dm.Atom) -> list or 1D numpy array``.
            concat: Whether to concat all the data into a single value in the output dict
            name: Name of the key name of the concatenated features
        """
        self._input_kwargs = locals().copy()
        self._input_kwargs.pop("self")
        # we also remove the featurizer funcs
        self._input_kwargs.pop("featurizer_funcs", None)
        self._toy_mol = dm.to_mol("CCO")
        self._feat_sizes = dict()
        if featurizer_funcs is None:
            featurizer_funcs = self.DEFAULT_FEATURIZER
        if not isinstance(featurizer_funcs, dict):
            get_name = lambda x: getattr(x, "__name__", repr(x))
            featurizer_funcs = dict((get_name(x), x) for x in featurizer_funcs)
        self.featurizer_funcs = featurizer_funcs
        for k in self.featurizer_funcs.keys():
            self.feat_size(feat_name=k)
        self.concat = concat
        self.name = name

    def to_state_dict(self):
        """
        Convert the Atom calculator to a state dict
        Due to some constraints and cross-version compatibility,  the featurizer functions
        need to be pickled and not just return a list
        """
        state_dict = {}
        state_dict["name"] = self.__class__.__name__
        state_dict["module"] = self.__class__.__module__
        state_dict["args"] = self._input_kwargs
        featurizer_fn_pickled = {}
        for fname, ffunc in self.featurizer_funcs.items():
            featurizer_fn_pickled[fname] = fn_to_hex(ffunc)
        state_dict["args"]["featurizer_funcs"] = featurizer_fn_pickled
        state_dict["_molfeat_version"] = MOLFEAT_VERSION

        signature = inspect.signature(self.__init__)
        val = {
            k: v.default
            for k, v in signature.parameters.items()
            # if v.default is not inspect.Parameter.empty
        }
        to_remove = [k for k in state_dict["args"] if k not in val.keys()]
        for k in to_remove:
            state_dict["args"].pop(k)

        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
        """Create an instance of an atom calculator from a state dict

        Args:
            state_dict: state dictionary to use to create the atom calculator
            override_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        # EN: at this moment, version compatibility is not enforced
        cls_name = state_dict.get("name", cls.__name__)
        module_name = state_dict.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)
        kwargs = state_dict["args"].copy()
        # now we need to unpickle the featurizer functions
        featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
        if featurizer_fn_pickled is not None:
            featurizer_fn_loaded = {}
            for k, v in featurizer_fn_pickled.items():
                featurizer_fn_loaded[k] = hex_to_fn(v)
            kwargs["featurizer_funcs"] = featurizer_fn_loaded
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Concatenate the data into a single value

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        return concat_dict(data_dict, new_name=self.name)

    def feat_size(self, feat_name=None):
        """Get the feature size for ``feat_name``.

        When there is only one feature, users do not need to provide ``feat_name``.

        Args:
            feat_name: Feature for query.

        Returns:
            int: Feature size for the feature with name ``feat_name``. Default to None.
        """
        if feat_name is None:
            assert (
                len(self.featurizer_funcs) == 1
            ), "feat_name should be provided if there are more than one features"
            feat_name = list(self.featurizer_funcs.keys())[0]

        if feat_name not in self.featurizer_funcs:
            raise ValueError(
                "Expect feat_name to be in {}, got {}".format(
                    list(self.featurizer_funcs.keys()), feat_name
                )
            )

        if feat_name not in self._feat_sizes:
            atom = self._toy_mol.GetAtomWithIdx(0)
            self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](atom))
        return self._feat_sizes[feat_name]

    def __len__(self):
        """Get length of the property estimator"""
        return sum(v for k, v in self._feat_sizes.items() if k != self.name)

    def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None):
        """
        Get rdkit basic descriptors for a molecule

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
        """
        mol = dm.to_mol(mol)
        num_atoms = mol.GetNumAtoms()
        atom_features = defaultdict(list)

        # Compute features for each atom
        for i in range(num_atoms):
            atom = mol.GetAtomWithIdx(i)
            for feat_name, feat_func in self.featurizer_funcs.items():
                atom_features[feat_name].append(feat_func(atom))

        # Stack the features and convert them to float arrays
        processed_features = dict()
        for feat_name, feat_list in atom_features.items():
            feat = np.stack(feat_list).astype(np.float32)
            processed_features[feat_name] = feat

        if self.concat:
            processed_features = self._concat(processed_features)

        if dtype is not None:
            for feat_name, feat in processed_features.items():
                feat = datatype.cast(feat, dtype=dtype)
                processed_features[feat_name] = feat

        return processed_features

__call__(mol, dtype=None)

Get rdkit basic descriptors for a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
dtype Callable

requested data type

None

Returns:

Name Type Description
dict

For each function in self.featurizer_funcs with the key k, store the computed feature under the key k.

Source code in molfeat/calc/atom.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None):
    """
    Get rdkit basic descriptors for a molecule

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
    """
    mol = dm.to_mol(mol)
    num_atoms = mol.GetNumAtoms()
    atom_features = defaultdict(list)

    # Compute features for each atom
    for i in range(num_atoms):
        atom = mol.GetAtomWithIdx(i)
        for feat_name, feat_func in self.featurizer_funcs.items():
            atom_features[feat_name].append(feat_func(atom))

    # Stack the features and convert them to float arrays
    processed_features = dict()
    for feat_name, feat_list in atom_features.items():
        feat = np.stack(feat_list).astype(np.float32)
        processed_features[feat_name] = feat

    if self.concat:
        processed_features = self._concat(processed_features)

    if dtype is not None:
        for feat_name, feat in processed_features.items():
            feat = datatype.cast(feat, dtype=dtype)
            processed_features[feat_name] = feat

    return processed_features

__init__(featurizer_funcs=None, concat=True, name='hv')

Init function of the atom property calculator

Parameters:

Name Type Description Default
featurizer_funcs

Mapping of feature name to the featurization function. For compatibility a list of callable/function is still accepted, and the corresponding featurizer name will be automatically generated. Each function is of signature func(dm.Atom) -> list or 1D numpy array.

None
concat bool

Whether to concat all the data into a single value in the output dict

True
name str

Name of the key name of the concatenated features

'hv'
Source code in molfeat/calc/atom.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def __init__(
    self,
    featurizer_funcs: Dict[str, Callable] = None,
    concat: bool = True,
    name: str = "hv",
):
    """
    Init function of the atom property calculator

    Args:
        featurizer_funcs : Mapping of feature name to the featurization function.
            For compatibility a list of callable/function is still accepted, and the corresponding
            featurizer name will be automatically generated. Each function is of signature
            ``func(dm.Atom) -> list or 1D numpy array``.
        concat: Whether to concat all the data into a single value in the output dict
        name: Name of the key name of the concatenated features
    """
    self._input_kwargs = locals().copy()
    self._input_kwargs.pop("self")
    # we also remove the featurizer funcs
    self._input_kwargs.pop("featurizer_funcs", None)
    self._toy_mol = dm.to_mol("CCO")
    self._feat_sizes = dict()
    if featurizer_funcs is None:
        featurizer_funcs = self.DEFAULT_FEATURIZER
    if not isinstance(featurizer_funcs, dict):
        get_name = lambda x: getattr(x, "__name__", repr(x))
        featurizer_funcs = dict((get_name(x), x) for x in featurizer_funcs)
    self.featurizer_funcs = featurizer_funcs
    for k in self.featurizer_funcs.keys():
        self.feat_size(feat_name=k)
    self.concat = concat
    self.name = name

__len__()

Get length of the property estimator

Source code in molfeat/calc/atom.py
191
192
193
def __len__(self):
    """Get length of the property estimator"""
    return sum(v for k, v in self._feat_sizes.items() if k != self.name)

feat_size(feat_name=None)

Get the feature size for feat_name.

When there is only one feature, users do not need to provide feat_name.

Parameters:

Name Type Description Default
feat_name

Feature for query.

None

Returns:

Name Type Description
int

Feature size for the feature with name feat_name. Default to None.

Source code in molfeat/calc/atom.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def feat_size(self, feat_name=None):
    """Get the feature size for ``feat_name``.

    When there is only one feature, users do not need to provide ``feat_name``.

    Args:
        feat_name: Feature for query.

    Returns:
        int: Feature size for the feature with name ``feat_name``. Default to None.
    """
    if feat_name is None:
        assert (
            len(self.featurizer_funcs) == 1
        ), "feat_name should be provided if there are more than one features"
        feat_name = list(self.featurizer_funcs.keys())[0]

    if feat_name not in self.featurizer_funcs:
        raise ValueError(
            "Expect feat_name to be in {}, got {}".format(
                list(self.featurizer_funcs.keys()), feat_name
            )
        )

    if feat_name not in self._feat_sizes:
        atom = self._toy_mol.GetAtomWithIdx(0)
        self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](atom))
    return self._feat_sizes[feat_name]

from_state_dict(state_dict, override_args=None) classmethod

Create an instance of an atom calculator from a state dict

Parameters:

Name Type Description Default
state_dict

state dictionary to use to create the atom calculator

required
override_args Optional[dict]

optional dictionary of arguments to override the ones in the state dict at construction of the new object

None
Source code in molfeat/calc/atom.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
@classmethod
def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
    """Create an instance of an atom calculator from a state dict

    Args:
        state_dict: state dictionary to use to create the atom calculator
        override_args: optional dictionary of arguments to override the ones in the state dict
            at construction of the new object
    """
    # EN: at this moment, version compatibility is not enforced
    cls_name = state_dict.get("name", cls.__name__)
    module_name = state_dict.get("module", cls.__module__)
    module = importlib.import_module(module_name)
    klass = getattr(module, cls_name)
    kwargs = state_dict["args"].copy()
    # now we need to unpickle the featurizer functions
    featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
    if featurizer_fn_pickled is not None:
        featurizer_fn_loaded = {}
        for k, v in featurizer_fn_pickled.items():
            featurizer_fn_loaded[k] = hex_to_fn(v)
        kwargs["featurizer_funcs"] = featurizer_fn_loaded
    kwargs.update(**(override_args or {}))
    return klass(**kwargs)

to_state_dict()

Convert the Atom calculator to a state dict Due to some constraints and cross-version compatibility, the featurizer functions need to be pickled and not just return a list

Source code in molfeat/calc/atom.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def to_state_dict(self):
    """
    Convert the Atom calculator to a state dict
    Due to some constraints and cross-version compatibility,  the featurizer functions
    need to be pickled and not just return a list
    """
    state_dict = {}
    state_dict["name"] = self.__class__.__name__
    state_dict["module"] = self.__class__.__module__
    state_dict["args"] = self._input_kwargs
    featurizer_fn_pickled = {}
    for fname, ffunc in self.featurizer_funcs.items():
        featurizer_fn_pickled[fname] = fn_to_hex(ffunc)
    state_dict["args"]["featurizer_funcs"] = featurizer_fn_pickled
    state_dict["_molfeat_version"] = MOLFEAT_VERSION

    signature = inspect.signature(self.__init__)
    val = {
        k: v.default
        for k, v in signature.parameters.items()
        # if v.default is not inspect.Parameter.empty
    }
    to_remove = [k for k in state_dict["args"] if k not in val.keys()]
    for k in to_remove:
        state_dict["args"].pop(k)

    return state_dict

AtomMaterialCalculator

Bases: AtomCalculator

Atom calculator with the extend atomic property list which have been collected from various material science packages

Source code in molfeat/calc/atom.py
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
class AtomMaterialCalculator(AtomCalculator):
    """Atom calculator with the extend atomic property list
    which have been collected from various material science packages
    """

    DEFAULT_FEATURIZER = {
        "atom_one_hot": atom_one_hot,
        "atom_extended_properties": atom_extended_properties,
        "atom_degree_one_hot": atom_degree_one_hot,
        "atom_implicit_valence_one_hot": atom_implicit_valence_one_hot,
        "atom_hybridization_one_hot": atom_hybridization_one_hot,
        "atom_is_aromatic": atom_is_aromatic,
        "atom_formal_charge": atom_formal_charge,
        "atom_num_radical_electrons": atom_num_radical_electrons,
        "atom_is_in_ring": atom_is_in_ring,
        "atom_chiral_tag_one_hot": atom_chiral_tag_one_hot,
        "atom_is_chiral_center": atom_is_chiral_center,
    }

DGLCanonicalAtomCalculator

Bases: AtomCalculator

Default canonical featurizer for atoms used by dgllife

Source code in molfeat/calc/atom.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
class DGLCanonicalAtomCalculator(AtomCalculator):
    """Default canonical featurizer for atoms used by dgllife"""

    DEFAULT_FEATURIZER = {
        "atom_one_hot": atom_one_hot,
        "atom_degree_one_hot": atom_degree_one_hot,
        "atom_implicit_valence_one_hot": atom_implicit_valence_one_hot,
        "atom_formal_charge": atom_formal_charge,
        "atom_num_radical_electrons": atom_num_radical_electrons,
        "atom_hybridization_one_hot": partial(
            atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST
        ),
        "atom_is_aromatic": atom_is_aromatic,
        "atom_total_num_H_one_hot": atom_total_num_H_one_hot,
    }

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Concatenate the data into a single value

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        out = concat_dict(data_dict, new_name=self.name, order=list(self.featurizer_funcs.keys()))
        return out

DGLWeaveAtomCalculator

Bases: DGLCanonicalAtomCalculator

Default atom featurizer used by WeaveNet in DGLLife

Source code in molfeat/calc/atom.py
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
class DGLWeaveAtomCalculator(DGLCanonicalAtomCalculator):
    """Default atom featurizer used by WeaveNet in DGLLife"""

    DEFAULT_FEATURIZER = {
        "atom_one_hot": partial(
            atom_one_hot, allowable_set=DGLLIFE_WEAVE_ATOMS, encode_unknown=True
        ),
        "atom_chiral_tag_one_hot": partial(
            atom_chiral_tag_one_hot, allowable_set=DGLLIFE_WEAVE_CHIRAL_TYPES
        ),
        "atom_formal_charge": atom_formal_charge,
        "atom_partial_charge": atom_partial_charge,
        "atom_is_aromatic": atom_is_aromatic,
        "atom_hybridization_one_hot": partial(
            atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST[:3]
        ),
    }

    def __init__(self, concat: bool = True, name: str = "hv"):
        featurizer_funcs = self.DEFAULT_FEATURIZER
        featurizer_funcs["atom_weavenet_props"] = self.atom_weave_props
        super().__init__(concat=concat, name=name, featurizer_funcs=featurizer_funcs)

    def _get_atom_state_info(self, feats):
        """Get atom Donor/Acceptor state information from chemical pharmacophore features

        Args:
            feats: computed chemical features
        """
        is_donor = defaultdict(bool)
        is_acceptor = defaultdict(bool)
        # Get hydrogen bond donor/acceptor information
        for feats in feats:
            if feats.GetFamily() == "Donor":
                nodes = feats.GetAtomIds()
                for u in nodes:
                    is_donor[u] = True
            elif feats.GetFamily() == "Acceptor":
                nodes = feats.GetAtomIds()
                for u in nodes:
                    is_acceptor[u] = True
        return is_donor, is_acceptor

    @staticmethod
    @lru_cache(maxsize=None)
    def _feat_factory_cache():
        """Build and cache chemical features caching for speed"""
        fdef_name = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
        chem_feats = ChemicalFeatures.BuildFeatureFactory(fdef_name)
        return chem_feats

    @lru_cache
    def _compute_weave_net_properties(self, mol: dm.Mol):
        # Get information for donor and acceptor
        chem_feats = self._feat_factory_cache()
        mol_feats = chem_feats.GetFeaturesForMol(mol)
        is_donor, is_acceptor = self._get_atom_state_info(mol_feats)
        sssr = GetSymmSSSR(mol)
        num_atoms = mol.GetNumAtoms()
        atom_features = []
        for i in range(num_atoms):
            cur_atom_props = [float(is_donor[i]), float(is_acceptor[i])]
            # Count the number of rings the atom belongs to for ring size between 3 and 8
            count = [0 for _ in range(3, 9)]
            for ring in sssr:
                ring_size = len(ring)
                if i in ring and 3 <= ring_size <= 8:
                    count[ring_size - 3] += 1
            cur_atom_props.extend(count)
            atom_features.append(cur_atom_props)
        return atom_features

    def atom_weave_props(self, atom: dm.Atom):
        """Get the WeaveNet properties for an atom"""
        mol = atom.GetOwningMol()
        feats = self._compute_weave_net_properties(mol)
        return feats[atom.GetIdx()]

    def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None):
        """
        Get rdkit basic descriptors for a molecule

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
        """
        AllChem.ComputeGasteigerCharges(mol)
        return super().__call__(
            mol,
            dtype,
        )

__call__(mol, dtype=None)

Get rdkit basic descriptors for a molecule

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
dtype Callable

requested data type

None

Returns:

Name Type Description
dict

For each function in self.featurizer_funcs with the key k, store the computed feature under the key k.

Source code in molfeat/calc/atom.py
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None):
    """
    Get rdkit basic descriptors for a molecule

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
    """
    AllChem.ComputeGasteigerCharges(mol)
    return super().__call__(
        mol,
        dtype,
    )

atom_weave_props(atom)

Get the WeaveNet properties for an atom

Source code in molfeat/calc/atom.py
353
354
355
356
357
def atom_weave_props(self, atom: dm.Atom):
    """Get the WeaveNet properties for an atom"""
    mol = atom.GetOwningMol()
    feats = self._compute_weave_net_properties(mol)
    return feats[atom.GetIdx()]

Bonds Featurizer

BondCalculator

Bases: SerializableCalculator

A class for bond featurizer which loops over all bonds in a molecule and featurizes them with the featurizer_funcs. The constructed graph is assumed to be a bi-directed graph by default.

Source code in molfeat/calc/bond.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
class BondCalculator(SerializableCalculator):
    """
    A class for bond featurizer which loops over all bonds in a molecule and
    featurizes them with the ``featurizer_funcs``. The constructed graph is assumed to be
    a bi-directed graph by default.
    """

    DEFAULT_FEATURIZER = {
        "bond_type_one_hot": bond_type_one_hot,
        "bond_stereo_one_hot": bond_stereo_one_hot,
        "bond_is_in_ring": bond_is_in_ring,
        "bond_is_conjugated": bond_is_conjugated,
        "bond_direction_one_hot": bond_direction_one_hot,
    }

    def __init__(
        self,
        featurizer_funcs: Union[list, dict] = None,
        self_loop: bool = False,
        concat: bool = True,
        name: str = "he",
    ):
        """
        Init function of the bond property calculator

        Args:
            featurizer_funcs: Mapping feature name to the featurization function.
            self_loop: Whether self loops will be added. Default to False. If True, an additional
                column of binary values to indicate the identity of self loops will be added.
                The other features of the self loops will be zero.
            concat: Whether to concat all the data into a single value in the output dict
            name: Name of the key name of the concatenated features
        """
        self._input_kwargs = locals().copy()
        self._input_kwargs.pop("self")
        # remove featurizer_funcs too
        self._input_kwargs.pop("featurizer_funcs", None)
        self._toy_mol = dm.to_mol("CO")
        self._feat_sizes = dict()
        if featurizer_funcs is None:
            featurizer_funcs = self.DEFAULT_FEATURIZER
        if not isinstance(featurizer_funcs, dict):
            get_name = lambda x: getattr(x, "__name__", repr(x))
            featurizer_funcs = dict((get_name(x), x) for x in featurizer_funcs)
        self.featurizer_funcs = featurizer_funcs
        self._self_loop = self_loop
        self.concat = concat
        self.name = name
        for k in self.featurizer_funcs.keys():
            self.feat_size(feat_name=k)
        if self._self_loop:
            self._feat_sizes["self_loop"] = 1

    def to_state_dict(self):
        """Convert the Atom calculator to a state dict
        Due to some constraints and cross-version compatibility,  the featurizer functions
        need to be pickled and not just list
        """
        state_dict = {}
        state_dict["name"] = self.__class__.__name__
        state_dict["module"] = self.__class__.__module__
        state_dict["args"] = self._input_kwargs

        featurizer_fn_pickled = {}
        for fname, ffunc in self.featurizer_funcs.items():
            featurizer_fn_pickled[fname] = fn_to_hex(ffunc)
        state_dict["args"]["featurizer_funcs"] = featurizer_fn_pickled
        state_dict["_molfeat_version"] = MOLFEAT_VERSION
        signature = inspect.signature(self.__init__)
        val = {
            k: v.default
            for k, v in signature.parameters.items()
            #    if v.default is not inspect.Parameter.empty
        }
        to_remove = [k for k in state_dict["args"] if k not in val.keys()]
        for k in to_remove:
            state_dict["args"].pop(k)
        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
        """Create an instance of an atom calculator from a state dict

        Args:
            state_dict: state dictionary to use to create the atom calculator
            override_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        # EN: at this moment, version compatibility is not enforced
        cls_name = state_dict.get("name", cls.__name__)
        module_name = state_dict.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)

        kwargs = state_dict["args"].copy()
        # now we need to unpickle the featurizer functions
        featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
        if featurizer_fn_pickled is not None:
            featurizer_fn_loaded = {}
            for k, v in featurizer_fn_pickled.items():
                featurizer_fn_loaded[k] = hex_to_fn(v)
            kwargs["featurizer_funcs"] = featurizer_fn_loaded
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Concatenate the data into a single value

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        return concat_dict(data_dict, new_name=self.name)

    def feat_size(self, feat_name: Optional[str] = None):
        """Get the feature size for ``feat_name``.

        When there is only one feature, ``feat_name`` can be None.

        Args:
            feat_name: Feature for query.

        Returns:
            int: Feature size for the feature with name ``feat_name``. Default to None.
        """
        if feat_name is None:
            assert (
                len(self.featurizer_funcs) == 1
            ), "feat_name should be provided if there are more than one features"
            feat_name = list(self.featurizer_funcs.keys())[0]

        if feat_name not in self.featurizer_funcs:
            raise ValueError(
                "Expect feat_name to be in {}, got {}".format(
                    list(self.featurizer_funcs.keys()), feat_name
                )
            )
        if feat_name not in self._feat_sizes:
            bond = self._toy_mol.GetBondWithIdx(0)
            self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](bond))
        return self._feat_sizes[feat_name]

    def __len__(self):
        """Get length of the property estimator"""
        return sum(v for k, v in self._feat_sizes.items() if k != self.name)

    def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None, **kwargs):
        """Featurize all bonds in a molecule.

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict: For each function in self.featurizer_funcs with the key ``k``,
                store the computed feature under the key ``k``.
        """
        mol = dm.to_mol(mol)
        num_bonds = mol.GetNumBonds()
        bond_features = defaultdict(list)

        # Compute features for each bond
        for i in range(num_bonds):
            bond = mol.GetBondWithIdx(i)
            for feat_name, feat_func in self.featurizer_funcs.items():
                feat = feat_func(bond)
                bond_features[feat_name].extend([feat, feat.copy()])

        # Stack the features and convert them to float arrays
        processed_features = dict()
        for feat_name, feat_list in bond_features.items():
            feat = np.stack(feat_list)
            processed_features[feat_name] = feat

        if self._self_loop and num_bonds > 0:
            num_atoms = mol.GetNumAtoms()
            for feat_name in processed_features:
                feats = processed_features[feat_name]
                # add a new label that says the feat are not self loop
                # feats = np.concatenate([feats, np.zeros((feats.shape[0], 1))], axis=1)
                # add a label at the last position that says it's a selfloop
                add_edges = np.zeros((num_atoms, feats.shape[1]))
                # self_loop_feats[:, -1] = 1
                feats = np.concatenate([feats, add_edges], axis=0)
                processed_features[feat_name] = feats
            self_loop_feats = np.concatenate(
                [np.zeros((num_bonds * 2, 1)), np.ones((num_atoms, 1))]
            )

            processed_features["self_loop"] = self_loop_feats

        if self._self_loop and num_bonds == 0:
            num_atoms = mol.GetNumAtoms()
            old_concat = self.concat
            self.concat = False
            processed_features = self(self._toy_mol)
            self.concat = old_concat
            for feat_name in processed_features:
                feats = processed_features[feat_name]
                feats = np.zeros((num_atoms, feats.shape[1]))
                processed_features[feat_name] = feats
        if self.concat and (num_bonds > 0 or self._self_loop):
            processed_features = self._concat(processed_features)
        if dtype is not None:
            for feat_name, feat in processed_features.items():
                feat = datatype.cast(feat, dtype=dtype)
                processed_features[feat_name] = feat

        return processed_features

__call__(mol, dtype=None, **kwargs)

Featurize all bonds in a molecule.

Parameters:

Name Type Description Default
mol Union[Mol, str]

the molecule of interest

required
dtype Callable

requested data type

None

Returns:

Name Type Description
dict

For each function in self.featurizer_funcs with the key k, store the computed feature under the key k.

Source code in molfeat/calc/bond.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def __call__(self, mol: Union[dm.Mol, str], dtype: Callable = None, **kwargs):
    """Featurize all bonds in a molecule.

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict: For each function in self.featurizer_funcs with the key ``k``,
            store the computed feature under the key ``k``.
    """
    mol = dm.to_mol(mol)
    num_bonds = mol.GetNumBonds()
    bond_features = defaultdict(list)

    # Compute features for each bond
    for i in range(num_bonds):
        bond = mol.GetBondWithIdx(i)
        for feat_name, feat_func in self.featurizer_funcs.items():
            feat = feat_func(bond)
            bond_features[feat_name].extend([feat, feat.copy()])

    # Stack the features and convert them to float arrays
    processed_features = dict()
    for feat_name, feat_list in bond_features.items():
        feat = np.stack(feat_list)
        processed_features[feat_name] = feat

    if self._self_loop and num_bonds > 0:
        num_atoms = mol.GetNumAtoms()
        for feat_name in processed_features:
            feats = processed_features[feat_name]
            # add a new label that says the feat are not self loop
            # feats = np.concatenate([feats, np.zeros((feats.shape[0], 1))], axis=1)
            # add a label at the last position that says it's a selfloop
            add_edges = np.zeros((num_atoms, feats.shape[1]))
            # self_loop_feats[:, -1] = 1
            feats = np.concatenate([feats, add_edges], axis=0)
            processed_features[feat_name] = feats
        self_loop_feats = np.concatenate(
            [np.zeros((num_bonds * 2, 1)), np.ones((num_atoms, 1))]
        )

        processed_features["self_loop"] = self_loop_feats

    if self._self_loop and num_bonds == 0:
        num_atoms = mol.GetNumAtoms()
        old_concat = self.concat
        self.concat = False
        processed_features = self(self._toy_mol)
        self.concat = old_concat
        for feat_name in processed_features:
            feats = processed_features[feat_name]
            feats = np.zeros((num_atoms, feats.shape[1]))
            processed_features[feat_name] = feats
    if self.concat and (num_bonds > 0 or self._self_loop):
        processed_features = self._concat(processed_features)
    if dtype is not None:
        for feat_name, feat in processed_features.items():
            feat = datatype.cast(feat, dtype=dtype)
            processed_features[feat_name] = feat

    return processed_features

__init__(featurizer_funcs=None, self_loop=False, concat=True, name='he')

Init function of the bond property calculator

Parameters:

Name Type Description Default
featurizer_funcs Union[list, dict]

Mapping feature name to the featurization function.

None
self_loop bool

Whether self loops will be added. Default to False. If True, an additional column of binary values to indicate the identity of self loops will be added. The other features of the self loops will be zero.

False
concat bool

Whether to concat all the data into a single value in the output dict

True
name str

Name of the key name of the concatenated features

'he'
Source code in molfeat/calc/bond.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def __init__(
    self,
    featurizer_funcs: Union[list, dict] = None,
    self_loop: bool = False,
    concat: bool = True,
    name: str = "he",
):
    """
    Init function of the bond property calculator

    Args:
        featurizer_funcs: Mapping feature name to the featurization function.
        self_loop: Whether self loops will be added. Default to False. If True, an additional
            column of binary values to indicate the identity of self loops will be added.
            The other features of the self loops will be zero.
        concat: Whether to concat all the data into a single value in the output dict
        name: Name of the key name of the concatenated features
    """
    self._input_kwargs = locals().copy()
    self._input_kwargs.pop("self")
    # remove featurizer_funcs too
    self._input_kwargs.pop("featurizer_funcs", None)
    self._toy_mol = dm.to_mol("CO")
    self._feat_sizes = dict()
    if featurizer_funcs is None:
        featurizer_funcs = self.DEFAULT_FEATURIZER
    if not isinstance(featurizer_funcs, dict):
        get_name = lambda x: getattr