`Calculators`¶

`get_calculator(name, **params)` ¶

Get molecular calculator based on name

Parameters:

Name	Type	Description	Default
`name`	`str`	Name of the featurizer	required
`params`	`dict`	Parameters of the featurizer	`{}`

Raises:

Type	Description
`ValueError`	When featurizer is not supported

Returns:

Name	Type	Description
`featurizer`		Callable

Source code in molfeat/calc/__init__.py

def get_calculator(name: str, **params):
    """Resolve a calculator name into a molecular calculator instance

    Args:
        name: Name of the featurizer (compared case-insensitively). A value that
            is not a string is returned unchanged.
        params (dict): Parameters of the featurizer

    Raises:
        ValueError: When featurizer is not supported

    Returns:
        featurizer: Callable (``None`` when the name is "none")
    """
    if not isinstance(name, str):
        # nothing to resolve: hand the object back as-is
        return name

    # case-insensitive view of the automatically registered calculators
    registered = {key.lower(): calc for key, calc in _CALCULATORS.items()}
    name = name.lower()

    if name in FP_FUNCS:
        return FPCalculator(name, **params)
    if name == "desc3d":
        return RDKitDescriptors3D(**params)
    if name == "desc2d":
        return RDKitDescriptors2D(**params)
    if name == "mordred":
        return MordredDescriptors(**params)
    if name == "cats":
        return CATS(**params)
    if name in ("cats2d", "cats3d"):
        # the suffix decides whether 3D distances are used, overriding any user value
        params["use_3d_distances"] = name == "cats3d"
        return CATS(**params)
    if name == "pharm2d":
        return Pharmacophore2D(**params)
    if name == "pharm3d":
        return Pharmacophore3D(**params)
    if name.startswith("usr"):
        # the full name is forwarded as the `method` argument
        params["method"] = name
        return USRDescriptors(**params)
    if name == "electroshape":
        return ElectroShapeDescriptors(**params)
    if name in ("scaffoldkeys", "skeys", "scaffkeys"):
        return ScaffoldKeyCalculator(**params)
    if name == "none":
        return None
    # for any generic calculator that has been automatically registered
    if name in registered:
        return registered[name](**params)
    raise ValueError(f"{name} is not a supported internal featurizer")

`Fingerprints`¶

FP_DEF_PARAMS = {'maccs': {}, 'avalon': {'nBits': 512, 'isQuery': False, 'resetVect': False, 'bitFlags': pyAvalonTools.avalonSimilarityBits}, 'ecfp': {'radius': 2, 'nBits': 2048, 'invariants': [], 'fromAtoms': [], 'useChirality': False, 'useBondTypes': True, 'useFeatures': False}, 'fcfp': {'radius': 2, 'nBits': 2048, 'invariants': [], 'fromAtoms': [], 'useChirality': False, 'useBondTypes': True, 'useFeatures': True}, 'topological': {'nBits': 2048, 'targetSize': 4, 'fromAtoms': 0, 'ignoreAtoms': 0, 'atomInvariants': 0, 'nBitsPerEntry': 4, 'includeChirality': False}, 'atompair': {'nBits': 2048, 'minLength': 1, 'maxLength': 30, 'fromAtoms': 0, 'ignoreAtoms': 0, 'atomInvariants': 0, 'nBitsPerEntry': 4, 'includeChirality': False, 'use2D': True, 'confId': -1}, 'rdkit': {'minPath': 1, 'maxPath': 7, 'fpSize': 2048, 'nBitsPerHash': 2, 'useHs': True, 'tgtDensity': 0.0, 'minSize': 128, 'branchedPaths': True, 'useBondOrder': True, 'atomInvariants': 0, 'fromAtoms': 0, 'atomBits': None, 'bitInfo': None}, 'pattern': {'fpSize': 2048, 'atomCounts': [], 'setOnlyBits': None}, 'layered': {'fpSize': 2048, 'minPath': 1, 'maxPath': 7, 'atomCounts': [], 'setOnlyBits': None, 'branchedPaths': True, 'fromAtoms': 0}, 'map4': {'dimensions': 2048, 'radius': 2}, 'secfp': {'n_permutations': 128, 'nBits': 2048, 'radius': 3, 'min_radius': 1, 'rings': True, 'kekulize': False, 'isomeric': False, 'seed': 0}, 'mhfp': {'n_permutations': 128, 'radius': 3, 'min_radius': 1, 'rings': True, 'kekulize': False, 'isomeric': False, 'seed': 0}, 'erg': {'atomTypes': 0, 'fuzzIncrement': 0.3, 'minPath': 1, 'maxPath': 15}, 'estate': {}, 'ecfp-count': {'radius': 2, 'nBits': 2048, 'invariants': [], 'fromAtoms': [], 'useChirality': False, 'useBondTypes': True, 'useFeatures': False, 'includeRedundantEnvironments': False}, 'fcfp-count': {'radius': 2, 'nBits': 2048, 'invariants': [], 'fromAtoms': [], 'useChirality': False, 'useBondTypes': True, 'useFeatures': True, 'includeRedundantEnvironments': False}, 
'topological-count': {'nBits': 2048, 'targetSize': 4, 'fromAtoms': 0, 'ignoreAtoms': 0, 'atomInvariants': 0, 'includeChirality': False}, 'avalon-count': {'nBits': 512, 'isQuery': False, 'bitFlags': pyAvalonTools.avalonSimilarityBits}, 'rdkit-count': {'minPath': 1, 'maxPath': 7, 'useHs': True, 'branchedPaths': True, 'useBondOrder': True, 'atomInvariants': 0, 'fromAtoms': 0, 'atomBits': None, 'bitInfo': None}, 'atompair-count': {'nBits': 2048, 'minLength': 1, 'maxLength': 30, 'fromAtoms': 0, 'ignoreAtoms': 0, 'atomInvariants': 0, 'includeChirality': False, 'use2D': True, 'confId': -1}} `module-attribute` ¶

FP_FUNCS = {'maccs': rdMolDescriptors.GetMACCSKeysFingerprint, 'avalon': pyAvalonTools.GetAvalonFP, 'ecfp': rdMolDescriptors.GetMorganFingerprintAsBitVect, 'fcfp': partial(rdMolDescriptors.GetMorganFingerprintAsBitVect, useFeatures=True), 'topological': rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect, 'atompair': rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect, 'rdkit': rdmolops.RDKFingerprint, 'pattern': rdmolops.PatternFingerprint, 'layered': rdmolops.LayeredFingerprint, 'map4': MAP4, 'secfp': SECFP, 'erg': rdReducedGraphs.GetErGFingerprint, 'estate': lambda x: EStateFingerprinter.FingerprintMol(x)[0], 'avalon-count': pyAvalonTools.GetAvalonCountFP, 'rdkit-count': rdmolops.UnfoldedRDKFingerprintCountBased, 'ecfp-count': rdMolDescriptors.GetHashedMorganFingerprint, 'fcfp-count': rdMolDescriptors.GetHashedMorganFingerprint, 'topological-count': rdMolDescriptors.GetHashedTopologicalTorsionFingerprint, 'atompair-count': rdMolDescriptors.GetHashedAtomPairFingerprint} `module-attribute` ¶

`FPCalculator` ¶

Bases: SerializableCalculator

Fingerprint bit calculator for a molecule

Source code in molfeat/calc/fingerprints.py

class FPCalculator(SerializableCalculator):
    """Fingerprint bit calculator for a molecule"""

    def __init__(
        self,
        method: str,
        length: Optional[int] = None,
        counting: bool = False,
        **kwargs,
    ):
        """Compute the given fingerprint for a molecule

        !!! note
            For efficiency reason, count fingerprints are hashed and potentially
            re-folded and the count corresponds to the number of bits set to true

        Args:
            method (str): Name of the fingerprint method to use (case-insensitive)
            length (int, optional): Length of the fingerprint. Defaults to None.
                The default corresponds to the fingerprint default.
            counting (bool, optional): Whether to use the count version of the fingerprint
            kwargs (dict): any parameters to the fingerprint algorithm

        Raises:
            ValueError: When the resolved method is not a key of `FP_FUNCS`
        """
        self.method = method.lower()
        self.counting = counting or "-count" in self.method
        if self.counting and "-count" not in self.method:
            self.method = self.method + "-count"
        self.input_length = length
        if self.method not in FP_FUNCS:
            raise ValueError(f"Method {self.method} is not a supported featurizer")
        # Defaults must be looked up with the *resolved* name (lower-cased, with the
        # "-count" suffix when counting is requested). Using the raw `method`
        # argument raised KeyError for mixed-case names and picked the non-count
        # parameter set when `counting=True`.
        default_params = copy.deepcopy(FP_DEF_PARAMS[self.method])
        unknown_params = set(kwargs.keys()).difference(set(default_params.keys()))
        if unknown_params:
            # unknown parameters are reported and then dropped, not raised
            logger.error(f"Params: {unknown_params} are not valid for {method}")
        self.params = default_params
        self.params.update({k: kwargs[k] for k in kwargs if k in default_params.keys()})
        self._length = self._set_length(length)

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        """
        return [f"fp_{i}" for i in range(self._length)]

    def __len__(self):
        """Return the length of the calculator"""
        return self._length

    def _set_length(self, length=None):
        """Get the length of the featurizer

        Fixed-size methods ("maccs", "estate", "erg") ignore `length`. Otherwise the
        length comes from the first size-like parameter found in `self.params`, and a
        truthy `length` overrides that parameter in place.

        Args:
            length: requested fingerprint length, or None to use the method default

        Returns:
            fplen: the length of the fingerprint
        """
        fplen = length
        len_key = None
        if self.method == "maccs":
            fplen = 167
        elif self.method == "estate":
            fplen = 79
        elif self.method == "erg":
            fplen = 315
        elif self.method == "rdkit-count" and not fplen:
            fplen = 2048
        elif "nBits" in self.params.keys():
            len_key = "nBits"
            fplen = self.params[len_key]
        elif "n_permutations" in self.params.keys():
            # special case for mhfp
            len_key = "n_permutations"
            fplen = self.params[len_key]
        elif "fpSize" in self.params.keys():
            len_key = "fpSize"
            fplen = self.params[len_key]
        elif "dimensions" in self.params.keys():
            len_key = "dimensions"
            fplen = self.params[len_key]
        if len_key is not None and length:
            # an explicit length wins over the value stored in the parameters
            self.params[len_key] = length
            fplen = length
        return fplen

    def __call__(self, mol: Union[rdchem.Mol, str], raw: bool = False):
        r"""
        Compute the Fingerprint of a molecule

        Args:
            mol: the molecule of interest
            raw: whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions

        Returns:
            props: the computed fingerprint, converted with `to_numpy` unless `raw` is True
        """
        mol = dm.to_mol(mol)
        fp_val = FP_FUNCS[self.method](mol, **self.params)
        if self.counting:
            fp_val = fold_count_fp(fp_val, self._length)
        if not raw:
            fp_val = to_numpy(fp_val)
        if self.counting and raw:
            # converting the folded counts back with `to_fp(..., bitvect=False)`
            fp_val = to_fp(fp_val, bitvect=False)
        return fp_val

    def __getstate__(self):
        """Serialize the class for pickling."""
        # EN: note that the state is standardized with all the parameter
        # because of the possibility of default changing after
        state = {}
        state["length"] = self.input_length
        state["input_length"] = self.input_length
        state["method"] = self.method
        state["counting"] = self.counting
        state["params"] = self.params
        return state

    def __setstate__(self, state: dict):
        """Set the state of the featurizer"""
        self.__dict__.update(state)
        # `_length` is not part of the saved state, so it is recomputed here
        self._length = self._set_length(self.input_length)

    def to_state_dict(self):
        """Get the state dictionary

        The parent state dict is extended with every parameter whose current value
        is not None and differs from the default of the method.
        """
        state_dict = super().to_state_dict()
        cur_params = self.params
        default_params = copy.deepcopy(FP_DEF_PARAMS[state_dict["args"]["method"]])
        state_dict["args"].update(
            {
                k: cur_params[k]
                for k in cur_params
                if (cur_params[k] != default_params[k] and cur_params[k] is not None)
            }
        )
        # we want to keep all the additional parameters in the state dict
        return state_dict

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`counting = counting or '-count' in self.method` `instance-attribute` ¶

`input_length = length` `instance-attribute` ¶

`method = method.lower()` `instance-attribute` ¶

`params = default_params` `instance-attribute` ¶

`call(mol, raw=False)` ¶

Compute the Fingerprint of a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`raw`	`bool`	whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions	`False`

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed rdkit molecular descriptors

Source code in molfeat/calc/fingerprints.py

def __call__(self, mol: Union[rdchem.Mol, str], raw: bool = False):
    r"""
    Compute the fingerprint of a molecule

    Args:
        mol: the molecule of interest
        raw: whether to keep original datatype or convert to numpy. Useful for rdkit's similarity functions

    Returns:
        props: the computed fingerprint, converted with `to_numpy` unless `raw` is True
    """
    mol = dm.to_mol(mol)
    fingerprint_fn = FP_FUNCS[self.method]
    fingerprint = fingerprint_fn(mol, **self.params)
    if self.counting:
        fingerprint = fold_count_fp(fingerprint, self._length)
    if not raw:
        return to_numpy(fingerprint)
    if self.counting:
        # converting the folded counts back with `to_fp(..., bitvect=False)`
        fingerprint = to_fp(fingerprint, bitvect=False)
    return fingerprint

`getstate()` ¶

Source code in molfeat/calc/fingerprints.py

def __getstate__(self):
    """Serialize the calculator for pickling."""
    # EN: the state carries every parameter explicitly, because the
    # defaults may change in a later version
    return {
        "length": self.input_length,
        "input_length": self.input_length,
        "method": self.method,
        "counting": self.counting,
        "params": self.params,
    }

`init(method, length=None, counting=False, **kwargs)` ¶

Compute the given fingerprint for a molecule

Note

For efficiency reason, count fingerprints are hashed and potentially re-folded and the count corresponds to the number of bits set to true

Parameters:

Name	Type	Description	Default
`method`	`str`	Name of the fingerprint method to use	required
`length`	`int`	Length of the fingerprint. Defaults to None. The default corresponds to the fingerprint default.	`None`
`counting`	`bool`	Whether to use the count version of the fingerprint	`False`
`kwargs`	`dict`	any parameters to the fingerprint algorithm	`{}`

Source code in molfeat/calc/fingerprints.py

def __init__(
    self,
    method: str,
    length: Optional[int] = None,
    counting: bool = False,
    **kwargs,
):
    """Compute the given fingerprint for a molecule

    !!! note
        For efficiency reason, count fingerprints are hashed and potentially
        re-folded and the count corresponds to the number of bits set to true

    Args:
        method (str): Name of the fingerprint method to use (case-insensitive)
        length (int, optional): Length of the fingerprint. Defaults to None.
            The default corresponds to the fingerprint default.
        counting (bool, optional): Whether to use the count version of the fingerprint
        kwargs (dict): any parameters to the fingerprint algorithm

    Raises:
        ValueError: When the resolved method is not a key of `FP_FUNCS`
    """
    self.method = method.lower()
    self.counting = counting or "-count" in self.method
    if self.counting and "-count" not in self.method:
        self.method = self.method + "-count"
    self.input_length = length
    if self.method not in FP_FUNCS:
        raise ValueError(f"Method {self.method} is not a supported featurizer")
    # Defaults must be looked up with the *resolved* name (lower-cased, with the
    # "-count" suffix when counting is requested). Using the raw `method`
    # argument raised KeyError for mixed-case names and picked the non-count
    # parameter set when `counting=True`.
    default_params = copy.deepcopy(FP_DEF_PARAMS[self.method])
    unknown_params = set(kwargs.keys()).difference(set(default_params.keys()))
    if unknown_params:
        # unknown parameters are reported and then dropped, not raised
        logger.error(f"Params: {unknown_params} are not valid for {method}")
    self.params = default_params
    self.params.update({k: kwargs[k] for k in kwargs if k in default_params.keys()})
    self._length = self._set_length(length)

`len()` ¶

Return the length of the calculator

Source code in molfeat/calc/fingerprints.py

def __len__(self):
    """Length of the calculator, as stored in `_length`"""
    size = self._length
    return size

`setstate(state)` ¶

Set the state of the featurizer

Source code in molfeat/calc/fingerprints.py

def __setstate__(self, state: dict):
    """Restore the featurizer from a saved state"""
    vars(self).update(state)
    # `_length` is recomputed from the restored `input_length`
    restored_length = self._set_length(self.input_length)
    self._length = restored_length

`to_state_dict()` ¶

Get the state dictionary

Source code in molfeat/calc/fingerprints.py

def to_state_dict(self):
    """Get the state dictionary

    The parent state dict is extended with every parameter whose current value
    is not None and differs from the default of the method.
    """
    state_dict = super().to_state_dict()
    args = state_dict["args"]
    defaults = copy.deepcopy(FP_DEF_PARAMS[args["method"]])
    overrides = {}
    for key, value in self.params.items():
        if value != defaults[key] and value is not None:
            overrides[key] = value
    # we want to keep all the additional parameters in the state dict
    args.update(overrides)
    return state_dict

`Descriptors`¶

`MordredDescriptors` ¶

Bases: SerializableCalculator

Compute mordred descriptors. The descriptor calculator does not mask errors in featurization and will propagate them.

Note

Mordred descriptors can result in undefined or nan behaviour depending on your input molecule. It is recommended that users handle those nan values themselves by either removing the descriptor or imputing the missing values.

Source code in molfeat/calc/descriptors.py

class MordredDescriptors(SerializableCalculator):
    r"""
    Calculator for mordred descriptors.
    Errors raised during featurization are not masked: they are propagated.

    !!! note
        Mordred descriptors can result in undefined or nan behaviour depending on your input molecule.
        It is recommended that users handle those nan values themselves by either removing the descriptor
        or imputing the missing values.
    """

    def __init__(
        self,
        ignore_3D: bool = True,
        replace_nan: bool = False,
        do_not_standardize: bool = False,
        **kwargs,
    ):
        """Mordred descriptor computation

        Args:
            ignore_3D (bool, optional): Whether to ignore 3D descriptors or include them
            replace_nan (bool, optional): Whether to replace nan or infinite values. Defaults to False.
            do_not_standardize: Whether to force standardize molecules or keep it the same

        Raises:
            ImportError: When `requires.check("mordred")` reports mordred as unavailable
        """
        mordred_available = requires.check("mordred")
        if not mordred_available:
            logger.error(
                "`mordred` is not available, please install it `pip install 'mordred[full]'`"
            )
            raise ImportError("Cannot import `mordred`")
        self.replace_nan = replace_nan
        self.ignore_3D = ignore_3D
        self.do_not_standardize = do_not_standardize
        # placeholder, filled right away by `_init_calc`
        self._calc = None
        self._init_calc()

    def _init_calc(self):
        """Build the underlying mordred calculator from the current `ignore_3D` setting"""
        self._calc = MordredCalculator(mordred_descriptors, ignore_3D=self.ignore_3D)

    @property
    def columns(self):
        """
        Names of all the descriptors of this calculator
        """
        return [str(descriptor) for descriptor in self._calc.descriptors]

    def __len__(self):
        """Length of the calculator, i.e. `len` of the underlying mordred calculator"""
        return len(self._calc)

    def __getstate__(self):
        """Serialize the class for pickling."""
        # `_calc` is left out on purpose: `__setstate__` rebuilds it
        return {
            "ignore_3D": self.ignore_3D,
            "replace_nan": self.replace_nan,
            "do_not_standardize": getattr(self, "do_not_standardize", False),
        }

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        vars(self).update(state)
        self._init_calc()

    def __call__(self, mol: Union[rdchem.Mol, str], conformer_id: Optional[int] = -1):
        r"""
        Compute the mordred descriptors of a molecule

        Args:
            mol: the molecule of interest
            conformer_id (int, optional): conformer id forwarded to the mordred calculator. Defaults to -1.

        Returns:
            props (np.ndarray): list of computed mordred molecular descriptors
        """
        mol = dm.to_mol(mol)
        result = self._calc(mol, conformer_id)
        vals = to_numpy(result.fill_missing())
        if not self.replace_nan:
            return vals
        return np.nan_to_num(vals)

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`do_not_standardize = do_not_standardize` `instance-attribute` ¶

`ignore_3D = ignore_3D` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol, conformer_id=-1)` ¶

Compute mordred descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`conformer_id`	`int`	Optional	`-1`

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed mordred molecular descriptors

Source code in molfeat/calc/descriptors.py

def __call__(self, mol: Union[rdchem.Mol, str], conformer_id: Optional[int] = -1):
    r"""
    Compute the mordred descriptors of a molecule

    Args:
        mol: the molecule of interest
        conformer_id (int, optional): conformer id forwarded to the mordred calculator. Defaults to -1.

    Returns:
        props (np.ndarray): list of computed mordred molecular descriptors
    """
    mol = dm.to_mol(mol)
    result = self._calc(mol, conformer_id)
    vals = to_numpy(result.fill_missing())
    if not self.replace_nan:
        return vals
    return np.nan_to_num(vals)

`getstate()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/descriptors.py

def __getstate__(self):
    """Serialize the class for pickling."""
    # `do_not_standardize` falls back to False when the attribute is missing
    return {
        "ignore_3D": self.ignore_3D,
        "replace_nan": self.replace_nan,
        "do_not_standardize": getattr(self, "do_not_standardize", False),
    }

`init(ignore_3D=True, replace_nan=False, do_not_standardize=False, **kwargs)` ¶

Mordred descriptor computation

Parameters:

Name	Type	Description	Default
`ignore_3D`	`bool`	Whether to ignore 3D descriptors or include them	`True`
`replace_nan`	`bool`	Whether to replace nan or infinite values. Defaults to False.	`False`
`do_not_standardize`	`bool`	Whether to force standardize molecules or keep it the same	`False`

Source code in molfeat/calc/descriptors.py

def __init__(
    self,
    ignore_3D: bool = True,
    replace_nan: bool = False,
    do_not_standardize: bool = False,
    **kwargs,
):
    """Mordred descriptor computation

    Args:
        ignore_3D (bool, optional): Whether to ignore 3D descriptors or include them
        replace_nan (bool, optional): Whether to replace nan or infinite values. Defaults to False.
        do_not_standardize: Whether to force standardize molecules or keep it the same

    Raises:
        ImportError: When `requires.check("mordred")` reports mordred as unavailable
    """
    mordred_available = requires.check("mordred")
    if not mordred_available:
        logger.error(
            "`mordred` is not available, please install it `pip install 'mordred[full]'`"
        )
        raise ImportError("Cannot import `mordred`")
    self.replace_nan = replace_nan
    self.ignore_3D = ignore_3D
    self.do_not_standardize = do_not_standardize
    # placeholder, filled right away by `_init_calc`
    self._calc = None
    self._init_calc()

`len()` ¶

Return the length of the calculator

Source code in molfeat/calc/descriptors.py

def __len__(self):
    """Length of the calculator, i.e. `len` of the underlying calculator"""
    calculator = self._calc
    return len(calculator)

`setstate(state)` ¶

Reload the class from pickling.

Source code in molfeat/calc/descriptors.py

def __setstate__(self, state: dict):
    """Reload the class from pickling."""
    vars(self).update(state)
    # the calculator is rebuilt from the restored attributes
    self._init_calc()

`RDKitDescriptors2D` ¶

Bases: SerializableCalculator

Compute a list of available rdkit 2D descriptors for a molecule. The descriptor calculator does not mask errors in featurization and will propagate them

Source code in molfeat/calc/descriptors.py

class RDKitDescriptors2D(SerializableCalculator):
    r"""
    Calculator for the available rdkit 2D descriptors of a molecule.
    Errors raised during featurization are not masked: they are propagated.
    """

    # descriptor name -> rdkit function computing it
    DESCRIPTORS_FN = dict(Descriptors.descList)

    def __init__(
        self,
        replace_nan: Optional[bool] = False,
        augment: Optional[bool] = True,
        descrs: List = None,
        avg_ipc: Optional[bool] = True,
        do_not_standardize: Optional[bool] = False,
        **kwargs,
    ):
        """RDKit descriptor computation

        Args:
            replace_nan: Whether to replace nan or infinite values. Defaults to False.
            augment: Whether to augment the descriptors with some additional custom features
            descrs: Subset of available features to consider if not None
            avg_ipc: Whether to average IPC values or to use rdkit original
            do_not_standardize: Whether to force standardization of molecule before computation of the descriptor.
                Set to True if you want molfeat<=0.5.3 behaviour
        """
        self.replace_nan = replace_nan
        self.augment = augment
        self.descrs = descrs
        self.avg_ipc = avg_ipc
        self.do_not_standardize = do_not_standardize

        available = [entry[0] for entry in Descriptors.descList]
        if self.augment:
            available.extend(
                [
                    "NumAtomStereoCenters",
                    "NumUnspecifiedAtomStereoCenters",
                    "NumBridgeheadAtoms",
                    "NumAmideBonds",
                    "NumSpiroAtoms",
                    "Alerts",
                ]
            )
        if descrs is None:
            self._columns = available
        else:
            # keep the caller's ordering, drop (and report) anything unknown
            self._columns = [x for x in descrs if x in available]
            unsupported = set(descrs) - set(available)
            if unsupported:
                logger.warning(f"Following features are not supported: {unsupported}")

    def __getstate__(self):
        """Serialize the class for pickling."""
        # EN: `avg_ipc` and `do_not_standardize` fall back to False when the
        # attribute is missing, for compat until next release
        return {
            "replace_nan": self.replace_nan,
            "augment": self.augment,
            "descrs": self.descrs,
            "_columns": self._columns,
            "avg_ipc": getattr(self, "avg_ipc", False),
            "do_not_standardize": getattr(self, "do_not_standardize", False),
        }

    def _compute_extra_features(self, mol: Union[rdchem.Mol, str]):
        """Compute the extra properties required for the augmented features version

        Args:
            mol: Input molecule

        Returns:
            props (dict): Dict of extra molecular properties
        """
        # all the work below is done on a deep copy of the input
        mol = copy.deepcopy(mol)
        FindMolChiralCenters(mol, force=True)
        # "NumAtomStereoCenters", "NumUnspecifiedAtomStereoCenters", "NumBridgeheadAtoms", "NumAmideBonds", "NumSpiroAtoms"
        calculator = rdMolDescriptors.Properties()
        names = calculator.GetPropertyNames()
        values = calculator.ComputeProperties(mol)
        extra = OrderedDict(zip(names, values))
        # Alerts
        extra["Alerts"] = properties(mol).ALERTS
        return extra

    @property
    def columns(self):
        """
        Names of all the descriptors of this calculator
        """
        return self._columns

    def __len__(self):
        """Length of the calculator, i.e. the number of columns"""
        return len(self._columns)

    @requires_standardization(disconnect_metals=True, remove_salt=True)
    def __call__(self, mol: Union[rdchem.Mol, str]):
        r"""
        Compute the rdkit descriptors of a molecule

        Args:
            mol: the molecule of interest

        Returns:
            props (np.ndarray): list of computed rdkit molecular descriptors

        Raises:
            ValueError: When a column name cannot be resolved to any descriptor source
        """
        mol = dm.to_mol(mol)
        extra = self._compute_extra_features(mol) if self.augment else {}
        charge_descr = _charge_descriptors_computation(mol)
        computed = []
        for name in self.columns:
            # lookup priority: charge descriptors, rdkit functions, then extra properties
            if name in charge_descr:
                value = charge_descr[name]
            elif name == "Ipc" and self.avg_ipc:  # bug fix of the rdkit IPC value
                value = self.DESCRIPTORS_FN[name](mol, avg=True)
            elif name in self.DESCRIPTORS_FN:
                value = self.DESCRIPTORS_FN[name](mol)
            elif name in extra:
                value = extra[name]
            else:
                raise ValueError(f"Property: {name} is not supported !")
            computed.append(value)
        computed = to_numpy(computed)
        if not self.replace_nan:
            return computed
        return np.nan_to_num(computed)

`DESCRIPTORS_FN = {name: fn for (name, fn) in Descriptors.descList}` `class-attribute` ¶

`augment = augment` `instance-attribute` ¶

`avg_ipc = avg_ipc` `instance-attribute` ¶

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`descrs = descrs` `instance-attribute` ¶

`do_not_standardize = do_not_standardize` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol)` ¶

Get rdkit basic descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed rdkit molecular descriptors

Source code in molfeat/calc/descriptors.py

@requires_standardization(disconnect_metals=True, remove_salt=True)
def __call__(self, mol: Union[rdchem.Mol, str]):
    r"""
    Compute the rdkit descriptors of a molecule

    Args:
        mol: the molecule of interest

    Returns:
        props (np.ndarray): list of computed rdkit molecular descriptors

    Raises:
        ValueError: When a column name cannot be resolved to any descriptor source
    """
    mol = dm.to_mol(mol)
    extra = self._compute_extra_features(mol) if self.augment else {}
    charge_descr = _charge_descriptors_computation(mol)
    computed = []
    for name in self.columns:
        # lookup priority: charge descriptors, rdkit functions, then extra properties
        if name in charge_descr:
            value = charge_descr[name]
        elif name == "Ipc" and self.avg_ipc:  # bug fix of the rdkit IPC value
            value = self.DESCRIPTORS_FN[name](mol, avg=True)
        elif name in self.DESCRIPTORS_FN:
            value = self.DESCRIPTORS_FN[name](mol)
        elif name in extra:
            value = extra[name]
        else:
            raise ValueError(f"Property: {name} is not supported !")
        computed.append(value)
    computed = to_numpy(computed)
    if not self.replace_nan:
        return computed
    return np.nan_to_num(computed)

`getstate()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/descriptors.py

def __getstate__(self):
    """Serialize the class for pickling."""
    # EN: `avg_ipc` and `do_not_standardize` fall back to False when the
    # attribute is missing, for compat until next release
    return {
        "replace_nan": self.replace_nan,
        "augment": self.augment,
        "descrs": self.descrs,
        "_columns": self._columns,
        "avg_ipc": getattr(self, "avg_ipc", False),
        "do_not_standardize": getattr(self, "do_not_standardize", False),
    }

`init(replace_nan=False, augment=True, descrs=None, avg_ipc=True, do_not_standardize=False, **kwargs)` ¶

RDKit descriptor computation

Parameters:

Name	Type	Description	Default
`replace_nan`	`Optional[bool]`	Whether to replace nan or infinite values. Defaults to False.	`False`
`augment`	`Optional[bool]`	Whether to augment the descriptors with some additional custom features	`True`
`descrs`	`List`	Subset of available features to consider if not None	`None`
`avg_ipc`	`Optional[bool]`	Whether to average IPC values or to use rdkit original	`True`
`do_not_standardize`	`Optional[bool]`	Whether to force standardization of molecule before computation of the descriptor. Set to True if you want molfeat<=0.5.3 behaviour	`False`

Source code in molfeat/calc/descriptors.py

def __init__(
    self,
    replace_nan: Optional[bool] = False,
    augment: Optional[bool] = True,
    descrs: List = None,
    avg_ipc: Optional[bool] = True,
    do_not_standardize: Optional[bool] = False,
    **kwargs,
):
    """RDKit descriptor computation

    Args:
        replace_nan: Whether to replace nan or infinite values. Defaults to False.
        augment: Whether to augment the descriptors with some additional custom features
        descrs: Subset of available features to consider if not None
        avg_ipc: Whether to average IPC values or to use rdkit original
        do_not_standardize: Whether to force standardization of molecule before computation of the descriptor.
            Set to True if you want molfeat<=0.5.3 behaviour
    """
    self.replace_nan = replace_nan
    self.augment = augment
    self.descrs = descrs
    self.avg_ipc = avg_ipc
    self.do_not_standardize = do_not_standardize

    available = [entry[0] for entry in Descriptors.descList]
    if self.augment:
        available.extend(
            [
                "NumAtomStereoCenters",
                "NumUnspecifiedAtomStereoCenters",
                "NumBridgeheadAtoms",
                "NumAmideBonds",
                "NumSpiroAtoms",
                "Alerts",
            ]
        )
    if descrs is None:
        self._columns = available
    else:
        # keep the caller's ordering, drop (and report) anything unknown
        self._columns = [x for x in descrs if x in available]
        unsupported = set(descrs) - set(available)
        if unsupported:
            logger.warning(f"Following features are not supported: {unsupported}")

`len()` ¶

Return the length of the calculator

Source code in molfeat/calc/descriptors.py

def __len__(self):
    """Return the length of the calculator, i.e. the number of entries in `self._columns`."""
    columns = self._columns
    return len(columns)

`RDKitDescriptors3D` ¶

Bases: SerializableCalculator

Compute a list of 3D rdkit descriptors

Source code in molfeat/calc/descriptors.py

class RDKitDescriptors3D(SerializableCalculator):
    """
    Compute a list of 3D rdkit descriptors

    Scalar descriptors (`_descr`) contribute one column each. Vector descriptors
    (`_vec_descr`) contribute `_vec_descr_length[i]` columns each, named `<name>_<pos>`.
    """

    # The descriptor tables are constants, so they are defined on the class rather than
    # in `__init__`. `__getstate__` does not serialize them; as class attributes they stay
    # resolvable on any instance, including one whose `__dict__` only holds that state.
    _descr = [
        "CalcAsphericity",
        "CalcEccentricity",
        "CalcInertialShapeFactor",
        "CalcNPR1",
        "CalcNPR2",
        "CalcPMI1",
        "CalcPMI2",
        "CalcPMI3",
        "CalcRadiusOfGyration",
        "CalcSpherocityIndex",
        "CalcPBF",
    ]
    _vec_descr = [
        "CalcAUTOCORR3D",
        "CalcRDF",
        "CalcMORSE",
        "CalcWHIM",
        "CalcGETAWAY",
    ]
    # Expected number of values of each vector descriptor, aligned with `_vec_descr`
    _vec_descr_length = [80, 210, 224, 114, 273]

    def __init__(
        self,
        replace_nan: bool = False,
        ignore_descrs: list = ["CalcGETAWAY"],
        **kwargs,
    ):
        """Compute 3D descriptors

        Args:
            replace_nan (bool, optional): Whether to replace nan or infinite values. Defaults to False.
            ignore_descrs (list, optional): Descriptors to ignore for performance issues. Defaults to ["CalcGETAWAY"].
                A falsy value (None or an empty list) means that nothing is ignored.
            kwargs: Extra keyword arguments; not used by this constructor.
        """
        self.replace_nan = replace_nan

        # Copy the list: the default is a single shared object, so storing it directly would
        # let a mutation of one instance's `ignore_descrs` leak into every later instance.
        self.ignore_descrs = list(ignore_descrs or [])

        self._columns = [x for x in self._descr if x not in self.ignore_descrs]
        for desc, desc_len in zip(self._vec_descr, self._vec_descr_length):
            if desc in self.ignore_descrs:
                continue
            self._columns.extend(f"{desc}_{pos}" for pos in range(desc_len))

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["replace_nan"] = self.replace_nan
        state["ignore_descrs"] = self.ignore_descrs
        state["_columns"] = self._columns
        return state

    def __len__(self):
        """Get the length of the descriptor"""
        return len(self._columns)

    @property
    def columns(self):
        """Get the descriptors columns"""
        return self._columns

    @requires_conformer
    def __call__(self, mol: Union[rdchem.Mol, str], conformer_id: Optional[int] = -1):
        r"""
        Get rdkit 3D descriptors for a molecule

        A descriptor that fails to compute is reported as NaN (one NaN for a scalar
        descriptor, `_vec_descr_length[i]` NaNs for a vector descriptor) instead of
        raising. Ignored descriptors are left out entirely.

        Args:
            mol: the molecule of interest
            conformer_id (int, optional): Optional conformer id. Defaults to -1.

        Returns:
            props (np.ndarray): list of computed rdkit 3D molecular descriptors
        """

        mol = dm.to_mol(mol)
        desc_val = []
        for desc in self._descr:
            if desc in self.ignore_descrs:
                continue
            val = float("nan")
            try:
                val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
            except Exception:
                # Best effort: keep the NaN placeholder, but let KeyboardInterrupt and
                # SystemExit propagate (a bare `except:` would swallow them).
                pass
            desc_val.append(val)

        for desc, desc_len in zip(self._vec_descr, self._vec_descr_length):
            if desc in self.ignore_descrs:
                continue
            val = [float("nan")] * desc_len
            try:
                val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
            except Exception:
                pass
            desc_val.extend(val)

        desc_val = to_numpy(desc_val)
        if self.replace_nan:
            desc_val = np.nan_to_num(desc_val)
        return desc_val

`columns` `property` ¶

Get the descriptors columns

`ignore_descrs = ignore_descrs or []` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`__call__(mol, conformer_id=-1)` ¶

Get rdkit 3D descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`conformer_id`	`int`	Optional conformer id. Defaults to -1.	`-1`

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed RDKit 3D molecular descriptors

Source code in molfeat/calc/descriptors.py

@requires_conformer
def __call__(self, mol: Union[rdchem.Mol, str], conformer_id: Optional[int] = -1):
    r"""
    Get rdkit 3D descriptors for a molecule

    A descriptor that fails to compute is reported as NaN (one NaN for a scalar
    descriptor, `_vec_descr_length[i]` NaNs for a vector descriptor) instead of
    raising. Ignored descriptors are left out entirely.

    Args:
        mol: the molecule of interest
        conformer_id (int, optional): Optional conformer id. Defaults to -1.

    Returns:
        props (np.ndarray): list of computed rdkit 3D molecular descriptors
    """

    mol = dm.to_mol(mol)
    desc_val = []
    for desc in self._descr:
        if desc in self.ignore_descrs:
            continue
        val = float("nan")
        try:
            val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
        except Exception:
            # Best effort: keep the NaN placeholder, but let KeyboardInterrupt and
            # SystemExit propagate (a bare `except:` would swallow them).
            pass
        desc_val.append(val)

    for desc, desc_len in zip(self._vec_descr, self._vec_descr_length):
        if desc in self.ignore_descrs:
            continue
        val = [float("nan")] * desc_len
        try:
            val = getattr(Descriptors3D.rdMolDescriptors, desc)(mol, confId=conformer_id)
        except Exception:
            pass
        desc_val.extend(val)

    desc_val = to_numpy(desc_val)
    if self.replace_nan:
        desc_val = np.nan_to_num(desc_val)
    return desc_val

`__getstate__()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/descriptors.py

def __getstate__(self):
    """Serialize the class for pickling.

    Returns:
        state (dict): the `replace_nan`, `ignore_descrs` and `_columns` attributes.
    """
    return {
        "replace_nan": self.replace_nan,
        "ignore_descrs": self.ignore_descrs,
        "_columns": self._columns,
    }

`__init__(replace_nan=False, ignore_descrs=['CalcGETAWAY'], **kwargs)` ¶

Compute 3D descriptors

Parameters:

Name	Type	Description	Default
`replace_nan`	`bool`	Whether to replace nan or infinite values. Defaults to False.	`False`
`ignore_descrs`	`list`	Descriptors to ignore for performance issues. Defaults to ["CalcGETAWAY"].	`['CalcGETAWAY']`

Source code in molfeat/calc/descriptors.py

def __init__(
    self,
    replace_nan: bool = False,
    ignore_descrs: list = ["CalcGETAWAY"],
    **kwargs,
):
    """Compute 3D descriptors

    Args:
        replace_nan (bool, optional): Whether to replace nan or infinite values. Defaults to False.
        ignore_descrs (list, optional): Descriptors to ignore for performance issues. Defaults to ["CalcGETAWAY"].
            A falsy value (None or an empty list) means that nothing is ignored.
        kwargs: Extra keyword arguments; not used by this constructor.
    """
    self.replace_nan = replace_nan

    # Scalar descriptors: one column each
    self._descr = [
        "CalcAsphericity",
        "CalcEccentricity",
        "CalcInertialShapeFactor",
        "CalcNPR1",
        "CalcNPR2",
        "CalcPMI1",
        "CalcPMI2",
        "CalcPMI3",
        "CalcRadiusOfGyration",
        "CalcSpherocityIndex",
        "CalcPBF",
    ]

    # Copy the list: the default is a single shared object, so storing it directly would
    # let a mutation of one instance's `ignore_descrs` leak into every later instance.
    self.ignore_descrs = list(ignore_descrs or [])

    # Vector descriptors and the number of values each one is expected to produce
    self._vec_descr = [
        "CalcAUTOCORR3D",
        "CalcRDF",
        "CalcMORSE",
        "CalcWHIM",
        "CalcGETAWAY",
    ]
    self._vec_descr_length = [80, 210, 224, 114, 273]

    self._columns = [x for x in self._descr if x not in self.ignore_descrs]
    for desc, desc_len in zip(self._vec_descr, self._vec_descr_length):
        if desc in self.ignore_descrs:
            continue
        self._columns.extend(f"{desc}_{pos}" for pos in range(desc_len))

`__len__()` ¶

Get the length of the descriptor

Source code in molfeat/calc/descriptors.py

def __len__(self):
    """Get the length of the descriptor, i.e. the number of entries in `self._columns`."""
    columns = self._columns
    return len(columns)

`CATS`¶

CATS 2D and 3D implementation based on the original work by Rajarshi Guha (rguha@indiana.edu, 08/26/07) and the RDKit port by Chris Arthur (1/11/2015). This version modernizes the code, improves performance, and adds support for 3D as well as distance binning. See: https://masterchemoinfo.u-strasbg.fr/Documents/Conferences/Lecture1_Pharmacophores_Schneider.pdf

`CATS` ¶

Bases: SerializableCalculator

Cats descriptors calculator based on PPPs (potential pharmacophore points). Can be either 2D or 3D.

Note

We need to consider all pairwise combinations of the 6 PPPs described in `CATS2D.SMARTS`, which would be $P(6,2) + 6$. However, as we only consider lexicographic order, the total size is then $\frac{P(6,2)}{2} + 6 = 21$, explaining the size of `CATS2D.DESCRIPTORS`.

Tip

The CATS descriptors are sensitive to the number of atoms in a molecule, meaning that you will get different results if you add or remove hydrogen atoms.

Source code in molfeat/calc/cats.py

class CATS(SerializableCalculator):
    r"""Cats descriptors calculator based on PPPs (potential pharmacophore points). Can be either 2D or 3D.

    !!! note
        We need to consider all pairwise combination of the 6 PPPs described in `CATS2D.SMARTS`
        which would be $P(6,2) + 6$. However, as we only consider lexicographic order, the total size
        is then $\frac{P(6,2)}{2} + 6 = 21$, explaining the size of `CATS2D.DESCRIPTORS`

    !!! tip
        The CATS descriptor are sensitive to the number of atoms in a molecule, meaning, you would get different
        results if you add or remove hydrogen atoms

    """

    # NOTE: the docstring above is a raw string on purpose: in a regular string literal
    # the `\f` of `\frac` is an escape sequence (form feed) and corrupts the formula.

    # SMARTS patterns defining each potential pharmacophore point (PPP) label
    SMARTS = {
        "D": ["[!$([#6,H0,-,-2,-3])]"],
        "A": ["[!$([#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]"],
        "P": ["[*+]", "[#7H2]"],
        "N": ["[*-]", "[C&$(C(=O)O),P&$(P(=O)),S&$(S(=O)O)]"],
        "L": [
            "[Cl,Br,I]",
            "[S;D2;$(S(C)(C))]",
            "[C;D2;$(C(=C)(=C))]",
            "[C;D3;$(C(=C)(C)(C))]",
            "[C;D4;$(C(C)(C)(C)(C))]",
            "[C;D3;H1;$(C(C)(C)(C))]",
            "[C;D2;H2;$(C(C)(C))]",
        ],
        "R": ["[a]"],
    }

    # The 21 label pairs, each written with its two labels in lexicographic order
    DESCRIPTORS = [
        "DD",
        "AD",
        "DP",
        "DN",
        "DL",
        "DR",
        "AA",
        "AP",
        "AN",
        "AL",
        "AR",
        "PP",
        "NP",
        "LP",
        "PR",
        "NN",
        "LN",
        "NR",
        "LL",
        "LR",
        "RR",
    ]

    MAX_DIST_DEFAULT_2D = 8
    MAX_DIST_DEFAULT_3D = 5

    def __init__(
        self,
        max_dist: Union[float, int] = None,
        bins: List[int] = None,
        scale: str = "raw",
        use_3d_distances: bool = False,
        **kwargs,
    ):
        """Calculator for the CATS descriptors.

        `max_dist` and `bins` will both determine the length of the fingerprint vector,
        which you can get by calling `len(calc)`

        Args:
            max_dist: Maximum distance between pairs. When set to None, the default for 2D is
                set to `max_dist=8` and for 3D to `max_dist=5`.
            bins: Bins to use. Defaults to equal spacing `[0, max_dist[`.
            scale: How to scale the values. Supported values are:
                 - 'raw' for the raw values.
                 - 'num' for values normalized by the number of atoms.
                 - 'count' for scaling based on occurrence of the PPP.
                Any other value leaves the raw values untouched.
            use_3d_distances: Whether to use the 3D distances instead of the topological distances.
                If set to True, the input molecules must contain a conformer.
            kwargs: silently ignored extra parameters for compatibility with other calculators.
        """

        # Fall back to the 2D/3D default when max_dist is not provided
        if max_dist is None:
            if use_3d_distances:
                max_dist = CATS.MAX_DIST_DEFAULT_3D
            else:
                max_dist = CATS.MAX_DIST_DEFAULT_2D

        self.max_dist = max_dist
        self.use_3d_distances = use_3d_distances

        if bins is None:
            bins = list(np.arange(1, np.floor(self.max_dist + 1), 1))

        # we don't allow interaction that exceed our distance threshold.
        # (this also builds a new list, so the caller's `bins` is never modified)
        bins = [x for x in bins if x <= self.max_dist]

        # we start distance indexing at 0
        if 0 not in bins:
            bins.append(0)

        self.bins = sorted(bins)

        self.scale = scale

        self._set_columns()

    def _set_columns(self):
        """Build the column names: one `<pair>.bins-<i>` entry per descriptor pair and bin."""
        self._columns = [
            f"{label}.bins-{i}" for label in self.DESCRIPTORS for i in range(len(self.bins))
        ]

    @classmethod
    def _pattern_to_mols(cls, smarts_dict=None):
        """Convert dict of list of smarts to rdkit molecules

        Args:
            smarts_dict: mapping from label to list of SMARTS strings. When None, `cls.SMARTS`
                is used and the converted patterns are cached per class.

        Returns:
            smarts_mols (dict): mapping from label to the list of converted patterns
        """

        # A dict is not hashable, so only the default (argument-free) path can go through
        # `lru_cache`; custom dictionaries are converted on every call.
        if smarts_dict is None:
            return cls._default_pattern_mols()
        return cls._compile_patterns(smarts_dict)

    @classmethod
    @functools.lru_cache(maxsize=None)
    def _default_pattern_mols(cls):
        """Cached conversion of `cls.SMARTS`."""
        return cls._compile_patterns(cls.SMARTS)

    @staticmethod
    def _compile_patterns(smarts_dict):
        """Convert every SMARTS string of `smarts_dict` with `dm.from_smarts`."""
        smarts_mols = ddict(list)
        for label, patterns in smarts_dict.items():
            smarts_mols[label] = [dm.from_smarts(patt) for patt in patterns]
        return smarts_mols

    def _get_pcore_group(self, mol: Union[rdchem.Mol, str]):
        """
        Assign a PPP (potential pharmacophore points) to individual atoms of a molecule.

        !!! note
            The return value is a list of length `N_atoms` of the
            input molecule. The i'th element of the list contains
            a list of PPP labels that were identified for the i'th atom,
            or an empty string when no label matched that atom.

        Args:
            mol: the molecule of interest

        Returns:
            ppp_labels (List[list]): list of all PPP labels for each atoms
        """

        smarts_mols = CATS._pattern_to_mols()

        ppp_labels = ["" for _ in range(mol.GetNumAtoms())]
        for label, patterns in smarts_mols.items():
            for pattern in patterns:
                matches = mol.GetSubstructMatches(pattern, uniquify=True)
                for matchbase in matches:
                    for idx in matchbase:
                        if ppp_labels[idx] == "":
                            ppp_labels[idx] = [label]
                        else:
                            ppp_labels[idx].append(label)
                # for a given label, only the first pattern that matches is used
                if matches:
                    break
        return ppp_labels

    def _get_ppp_matrix(self, n_atoms: int, ppp_labels: List):
        """Compute PPP matrix from label list

        Args:
            n_atoms (int): number of atoms
            ppp_labels (list): PPP labels returned by `_get_pcore_group`

        Returns:
            pppm (dict): PPP matrix where the keys are the coordinate `(i, j)` of a pair of
                labelled atoms and the values are the list of unique label pairs, each pair
                sorted in lexicographic order
        """

        pppm = {}
        for i in range(n_atoms):
            ppp_i = ppp_labels[i]
            if ppp_i == "":
                continue
            for j in range(n_atoms):
                ppp_j = ppp_labels[j]
                if ppp_j == "":
                    continue
                pairs = []
                for x in ppp_i:
                    for y in ppp_j:
                        if (x, y) not in pairs and (y, x) not in pairs:
                            ## make sure to add the labels in increasing
                            ## lexicographical order
                            pairs.append((x, y) if x < y else (y, x))
                pppm[(i, j)] = pairs
        return pppm

    def _calculate(self, mol, dist_mat):
        """Calculate the CATS descriptors for current molecule, given a distance matrix

        Args:
            mol: the molecule of interest
            dist_mat: distance matrix indexed as `dist_mat[i, j]` by atom indices

        Returns:
            res (list): flat list with `len(self.bins)` values for each entry of `DESCRIPTORS`
        """

        n_atoms = mol.GetNumAtoms()
        ppp_labels = self._get_pcore_group(mol)
        ppp_mat = self._get_ppp_matrix(n_atoms, ppp_labels)

        # get the occurrence of each of the PPP's
        ppp_count = dict.fromkeys(["D", "N", "A", "P", "L", "R"], 0)
        for labels in ppp_labels:
            for ppp in labels:
                ppp_count[ppp] += 1

        # lets calculate the CATS raw descriptor
        # bins: a, b, c ==> [a, b], [b, c], [c, *]
        # a is always 0
        n_bins = len(self.bins)
        desc = [[0] * n_bins for _ in self.DESCRIPTORS]
        # row of each label pair, built once instead of a list scan per interaction
        desc_index = {name: pos for pos, name in enumerate(self.DESCRIPTORS)}
        for (x, y), labels in ppp_mat.items():
            dist = dist_mat[x, y]
            # ignore all interactions greater than the max distance we set
            # we cannot have negative distance
            if dist > self.max_dist or dist < 0:
                continue

            # np.digitize is 1-based here, hence the `- 1` (indexing at 0)
            dist_bin = np.digitize(dist, self.bins) - 1
            for pair in labels:
                desc[desc_index[f"{pair[0]}{pair[1]}"]][dist_bin] += 1

        if self.scale == "num":
            # A molecule without atoms has no interaction at all: keep the zeros rather
            # than dividing by zero.
            if n_atoms > 0:
                desc = [[float(val) / n_atoms for val in row] for row in desc]

        elif self.scale == "count":
            # each row in desc corresponds to a PPP pair
            # so the scale factor is constant over cols of a row
            for i, name in enumerate(self.DESCRIPTORS):
                fac = ppp_count[name[0]] + ppp_count[name[1]]
                if fac == 0:
                    continue
                desc[i] = [val / float(fac) for val in desc[i]]

        return [col for row in desc for col in row]

    def __len__(self):
        """Return the length of the calculator"""
        return len(self._columns)

    def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1):
        """Get CATS descriptors for a molecule

        Args:
            mol: the molecule of interest.
            conformer_id: Optional conformer id. Only relevant when `use_3d_distances`
                is set to True.

        Raises:
            ValueError: When the input cannot be converted to a molecule, or when
                `use_3d_distances` is True and the molecule has no conformer.

        Returns:
            props (np.ndarray): list of computed rdkit molecular descriptors
        """

        mol = dm.to_mol(mol)

        # Fail with an explicit message instead of an AttributeError on `None`
        if mol is None:
            raise ValueError("The input molecule is not valid.")

        if self.use_3d_distances:
            if mol.GetNumConformers() < 1:  # type: ignore
                raise ValueError("Expected a molecule with conformers information.")

            dist_mat = Get3DDistanceMatrix(mol, confId=conformer_id)

        else:
            dist_mat = GetDistanceMatrix(mol).astype(int)

        out = self._calculate(mol, dist_mat)
        return to_numpy(out)

    @property
    def columns(self):
        """Get the descriptors columns"""
        return self._columns

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["max_dist"] = self.max_dist
        state["bins"] = self.bins
        state["scale"] = self.scale
        state["use_3d_distances"] = self.use_3d_distances
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        # `_columns` is not part of the state, it is rebuilt from the restored bins
        self._set_columns()

`DESCRIPTORS = ['DD', 'AD', 'DP', 'DN', 'DL', 'DR', 'AA', 'AP', 'AN', 'AL', 'AR', 'PP', 'NP', 'LP', 'PR', 'NN', 'LN', 'NR', 'LL', 'LR', 'RR']` `class-attribute` ¶

`MAX_DIST_DEFAULT_2D = 8` `class-attribute` ¶

`MAX_DIST_DEFAULT_3D = 5` `class-attribute` ¶

`SMARTS = {'D': ['[!$([#6,H0,-,-2,-3])]'], 'A': ['[!$([#6,F,Cl,Br,I,o,s,nX3,#7v5,#15v5,#16v4,#16v6,*+1,*+2,*+3])]'], 'P': ['[*+]', '[#7H2]'], 'N': ['[*-]', '[C&$(C(=O)O),P&$(P(=O)),S&$(S(=O)O)]'], 'L': ['[Cl,Br,I]', '[S;D2;$(S(C)(C))]', '[C;D2;$(C(=C)(=C))]', '[C;D3;$(C(=C)(C)(C))]', '[C;D4;$(C(C)(C)(C)(C))]', '[C;D3;H1;$(C(C)(C)(C))]', '[C;D2;H2;$(C(C)(C))]'], 'R': ['[a]']}` `class-attribute` ¶

`bins = list(sorted(bins))` `instance-attribute` ¶

`columns` `property` ¶

Get the descriptors columns

`max_dist = max_dist` `instance-attribute` ¶

`scale = scale` `instance-attribute` ¶

`use_3d_distances = use_3d_distances` `instance-attribute` ¶

`__call__(mol, conformer_id=-1)` ¶

Get CATS 2D descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[dm.Mol, str]`	the molecule of interest.	required
`conformer_id`	`int`	Optional conformer id. Only relevant when `use_3d_distances` is set to True.	`-1`

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed rdkit molecular descriptors

Source code in molfeat/calc/cats.py

def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1):
    """Get CATS descriptors for a molecule

    Topological distances are used unless `use_3d_distances` is set, in which case
    the 3D distance matrix of the requested conformer is used.

    Args:
        mol: the molecule of interest.
        conformer_id: Optional conformer id. Only relevant when `use_3d_distances`
            is set to True.

    Raises:
        ValueError: When the input cannot be converted to a molecule, or when
            `use_3d_distances` is True and the molecule has no conformer.

    Returns:
        props (np.ndarray): list of computed rdkit molecular descriptors
    """

    mol = dm.to_mol(mol)

    # Fail with an explicit message instead of an AttributeError on `None`
    if mol is None:
        raise ValueError("The input molecule is not valid.")

    if self.use_3d_distances:
        if mol.GetNumConformers() < 1:  # type: ignore
            raise ValueError("Expected a molecule with conformers information.")

        dist_mat = Get3DDistanceMatrix(mol, confId=conformer_id)

    else:
        dist_mat = GetDistanceMatrix(mol).astype(int)

    out = self._calculate(mol, dist_mat)
    return to_numpy(out)

`__getstate__()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/cats.py

def __getstate__(self):
    """Serialize the class for pickling.

    Returns:
        state (dict): the `max_dist`, `bins`, `scale` and `use_3d_distances` attributes.
    """
    return {
        "max_dist": self.max_dist,
        "bins": self.bins,
        "scale": self.scale,
        "use_3d_distances": self.use_3d_distances,
    }

`__init__(max_dist=None, bins=None, scale='raw', use_3d_distances=False, **kwargs)` ¶

Calculator for the CATS descriptors.

max_dist and bins will both determine the length of the fingerprint vector, which you can get by calling len(calc)

Parameters:

Name	Type	Description	Default
`max_dist`	`Union[float, int]`	Maximum distance between pairs. When set to None, the default for 2D is set to `max_dist=8` and for 3D to `max_dist=5`.	`None`
`bins`	`List[int]`	Bins to use. Defaults to equal spacing `[0, max_dist[`.	`None`
`scale`	`str`	How to scale the values. Supported values are: - 'raw' for the raw values. - 'num' for values normalized by the number of atoms. - 'count' for scaling based on occurrence of the PPP.	`'raw'`
`use_3d_distances`	`bool`	Whether to use the 3D distances instead of the topological distances. If set to True, the input molecules must contain a conformer.	`False`
`kwargs`		silently ignored extra parameters for compatibility with other calculators.	`{}`

Source code in molfeat/calc/cats.py

def __init__(
    self,
    max_dist: Union[float, int] = None,
    bins: List[int] = None,
    scale: str = "raw",
    use_3d_distances: bool = False,
    **kwargs,
):
    """Calculator for the CATS descriptors.

    The length of the fingerprint vector depends on both `max_dist` and `bins`;
    call `len(calc)` to get it.

    Args:
        max_dist: Maximum distance between pairs. When None, `max_dist=8` is used
            for 2D and `max_dist=5` for 3D.
        bins: Bins to use. Defaults to equal spacing `[0, max_dist[`. Bins larger than
            `max_dist` are dropped, and `0` is added when missing.
        scale: How to scale the values. Supported values are:
             - 'raw' for the raw values.
             - 'num' for values normalized by the number of atoms.
             - 'count' for scaling based on occurrence of the PPP.
        use_3d_distances: Whether to use the 3D distances instead of the topological distances.
            If set to True, the input molecules must contain a conformer.
        kwargs: silently ignored extra parameters for compatibility with other calculators.
    """

    # Fall back to the 2D/3D default when no maximum distance is given
    if max_dist is None:
        max_dist = CATS.MAX_DIST_DEFAULT_3D if use_3d_distances else CATS.MAX_DIST_DEFAULT_2D

    self.max_dist = max_dist
    self.use_3d_distances = use_3d_distances

    if bins is None:
        upper = np.floor(self.max_dist + 1)
        bins = list(np.arange(1, upper, 1))

    # interactions beyond the distance threshold are not allowed
    kept = [edge for edge in bins if edge <= self.max_dist]

    # distance indexing starts at 0
    if 0 not in kept:
        kept.append(0)

    kept.sort()
    self.bins = kept

    self.scale = scale

    self._set_columns()

`__len__()` ¶

Return the length of the calculator

Source code in molfeat/calc/cats.py

def __len__(self):
    """Return the length of the calculator, i.e. the number of entries in `self._columns`."""
    columns = self._columns
    return len(columns)

`__setstate__(state)` ¶

Reload the class from pickling.

Source code in molfeat/calc/cats.py

def __setstate__(self, state: dict):
    """Reload the class from pickling.

    Args:
        state: attributes to restore on the instance. The columns are then
            recomputed through `_set_columns`.
    """
    vars(self).update(state)
    self._set_columns()

`Pharmacophore`¶

`Pharmacophore2D` ¶

Bases: SerializableCalculator

2D Pharmacophore.

The fingerprint is computed using Generate.Gen2DFingerprint from RDKit.

An explanation of pharmacophore fingerprints and how the bits are set is available in the RDKit book. In particular, the following figure describes the process. ![Pharmacophore](https://www.rdkit.org/docs/_images/picture_10.jpg){ align=left }

Source code in molfeat/calc/pharmacophore.py

class Pharmacophore2D(SerializableCalculator):
    """2D Pharmacophore.

    The fingerprint is computed using `Generate.Gen2DFingerprint` from RDKit.

    An explanation of pharmacophore fingerprints and how the bits are set
    is available in the RDKit book. In particular the following figure describes the process.
    ![Pharmacophore](https://www.rdkit.org/docs/_images/picture_10.jpg){ align=left }
    """

    def __init__(
        self,
        factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
        length: Optional[int] = 2048,
        useCounts: Optional[bool] = None,
        minPointCount: Optional[int] = None,
        maxPointCount: Optional[int] = None,
        shortestPathsOnly: Optional[bool] = None,
        includeBondOrder: Optional[bool] = None,
        skipFeats: Optional[List[str]] = None,
        trianglePruneBins: Optional[bool] = None,
        bins: Optional[List[Tuple[int, int]]] = None,
        **kwargs,
    ):
        """Pharmacophore computation.

        Args:
            factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
                to a feature definition or a feature factory object
            length: Optional desired length. If provided, the fp will be refold or padded to that length.
                If set to None, fallback to the default for the provided sig factory.
            useCounts: Whether take into account the count information. This will also impact how the folding works.
            minPointCount: Minimum number of points.
            maxPointCount: Maximum number of points.
            shortestPathsOnly: Whether to only use the shortest path between pharmacophores.
            includeBondOrder: Whether to consider bond order.
            skipFeats: Forwarded as-is to `get_sig_factory`.
            trianglePruneBins: Whether to prune the triangle inequality.
            bins: Bins to use.
            kwargs: Extra keyword arguments; not used by this constructor.
        """

        self.factory = factory
        self.useCounts = useCounts
        self.minPointCount = minPointCount
        self.maxPointCount = maxPointCount
        self.shortestPathsOnly = shortestPathsOnly
        self.includeBondOrder = includeBondOrder
        self.skipFeats = skipFeats
        self.trianglePruneBins = trianglePruneBins
        self.bins = bins

        self.length = length

        self._init_sig_factory()

    def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
        """Compute the Pharmacophore fingerprint for the input molecule.

        Args:
            mol: the molecule of interest
            raw: Whether to return the raw fingerprint or a Numpy array.

        Raises:
            ValueError: When the input cannot be converted to a molecule.

        Returns:
            fp: the computed fingerprint as a Numpy array or as a raw object.
        """

        # Get a molecule
        mol = dm.to_mol(mol)

        if mol is None:
            raise ValueError("The input molecule is not valid.")

        # Get distance matrix
        use_bond_order = self.sig_factory.includeBondOrder
        d_mat = rdmolops.GetDistanceMatrix(mol, use_bond_order)

        # Generate the fingerprint
        fp = Generate.Gen2DFingerprint(mol, self.sig_factory, dMat=d_mat)

        # Postprocessing
        if self.length and self._should_fold:
            # refold the fingerprint
            fp = fold_count_fp(fp, dim=self.length, binary=not (self.useCounts or False))
            if raw:
                fp = to_fp(fp, bitvect=True)

        if not raw:
            fp = to_numpy(fp)

        return fp

    def _init_sig_factory(self):
        """Init the feature factory for this pharmacophore."""

        self.sig_factory = get_sig_factory(
            self.factory,
            useCounts=self.useCounts,
            minPointCount=self.minPointCount,
            maxPointCount=self.maxPointCount,
            shortestPathsOnly=self.shortestPathsOnly,
            includeBondOrder=self.includeBondOrder,
            skipFeats=self.skipFeats,
            trianglePruneBins=self.trianglePruneBins,
            bins=self.bins,
        )

        # Reinject used params to the class attributes
        # It might be useful in case the default values are changed
        # and when serializing the object.
        self.useCounts = self.sig_factory.useCounts
        self.minPointCount = self.sig_factory.minPointCount
        self.maxPointCount = self.sig_factory.maxPointCount
        self.shortestPathsOnly = self.sig_factory.shortestPathsOnly
        self.includeBondOrder = self.sig_factory.includeBondOrder
        self.skipFeats = self.sig_factory.skipFeats
        self.trianglePruneBins = self.sig_factory.trianglePruneBins
        self.bins = self.sig_factory.GetBins()

    @property
    def _should_fold(self):
        """Whether the signature size of the factory differs from the length of this calculator.

        Evaluated on every access on purpose: an `lru_cache` here would be keyed on `self`,
        which keeps every instance alive for the lifetime of the cache and returns a stale
        answer once `length` or the signature factory changes.
        """
        return self.sig_factory.GetSigSize() != len(self)

    @property
    def feature_factory(self):
        """Feature factory used by the signature factory."""
        return self.sig_factory.featFactory

    def __len__(self):
        """Returns the length of the pharmacophore"""
        return self.length or self.sig_factory.GetSigSize()

    @property
    def columns(self):
        """Get the name of all the descriptors of this calculator."""

        if not self.length:
            # no target length: every bit keeps the description given by the factory
            return [self.sig_factory.GetBitDescription(x) for x in range(len(self))]
        else:
            return [f"Desc:{i}" for i in range(self.length)]

    def __getstate__(self):
        """Serialize the class for pickling."""
        state = {}
        state["factory"] = self.factory
        state["useCounts"] = self.useCounts
        state["minPointCount"] = self.minPointCount
        state["maxPointCount"] = self.maxPointCount
        state["shortestPathsOnly"] = self.shortestPathsOnly
        state["includeBondOrder"] = self.includeBondOrder
        state["skipFeats"] = self.skipFeats
        state["trianglePruneBins"] = self.trianglePruneBins
        state["bins"] = self.bins
        state["length"] = self.length
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        # the signature factory is not part of the state, it is rebuilt from the parameters
        self._init_sig_factory()

`bins = bins` `instance-attribute` ¶

`columns` `property` ¶

Get the name of all the descriptors of this calculator.

`factory = factory` `instance-attribute` ¶

`feature_factory` `property` ¶

`includeBondOrder = includeBondOrder` `instance-attribute` ¶

`length = length` `instance-attribute` ¶

`maxPointCount = maxPointCount` `instance-attribute` ¶

`minPointCount = minPointCount` `instance-attribute` ¶

`shortestPathsOnly = shortestPathsOnly` `instance-attribute` ¶

`skipFeats = skipFeats` `instance-attribute` ¶

`trianglePruneBins = trianglePruneBins` `instance-attribute` ¶

`useCounts = useCounts` `instance-attribute` ¶

`__call__(mol, raw=False)` ¶

Compute the Pharmacophore fingerprint for the input molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Union[dm.Mol, str]`	the molecule of interest	required
`raw`	`bool`	Whether to return the raw fingerprint or a Numpy array.	`False`

Returns:

Name	Type	Description
`fp`		the computed fingerprint as a Numpy array or as a raw object.

Source code in molfeat/calc/pharmacophore.py

def __call__(self, mol: Union[dm.Mol, str], raw: bool = False):
    """Compute the 2D pharmacophore fingerprint of a molecule.

    Args:
        mol: the molecule of interest
        raw: Whether to return the raw fingerprint or a Numpy array.

    Raises:
        ValueError: if `dm.to_mol` cannot build a molecule from the input.

    Returns:
        fp: the computed fingerprint as a Numpy array or as a raw object.
    """

    mol = dm.to_mol(mol)
    if mol is None:
        raise ValueError("The input molecule is not valid.")

    # The distance matrix honours the bond-order setting of the signature factory
    d_mat = rdmolops.GetDistanceMatrix(mol, self.sig_factory.includeBondOrder)
    fp = Generate.Gen2DFingerprint(mol, self.sig_factory, dMat=d_mat)

    if self.length and self._should_fold:
        # Refold to the requested length; folding is binary unless counts are requested
        fp = fold_count_fp(fp, dim=self.length, binary=not self.useCounts)
        if raw:
            # A folded fingerprint is handed back as a bit vector in raw mode
            return to_fp(fp, bitvect=True)

    return fp if raw else to_numpy(fp)

`getstate()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/pharmacophore.py

def __getstate__(self):
    """Serialize the class for pickling.

    Returns:
        state: a dict holding the user-facing settings of the calculator.
            Nothing else stored on the instance is included.
    """
    # Attribute names are listed in the order they must appear in the state.
    persisted = (
        "factory",
        "useCounts",
        "minPointCount",
        "maxPointCount",
        "shortestPathsOnly",
        "includeBondOrder",
        "skipFeats",
        "trianglePruneBins",
        "bins",
        "length",
    )
    return {attr: getattr(self, attr) for attr in persisted}

`init(factory='pmapper', length=2048, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, **kwargs)` ¶

Pharmacophore computation.

Parameters:

Name	Type	Description	Default
`factory`	`Union[str, MolChemicalFeatureFactory]`	Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path to a feature definition or a feature factory object	`'pmapper'`
`length`	`Optional[int]`	Optional desired length. If provided, the fp will be refolded or padded to that length. If set to None, fall back to the default for the provided sig factory.	`2048`
`minPointCount`	`int`	Minimum number of points.	`None`
`maxPointCount`	`int`	Maximum number of points.	`None`
`trianglePruneBins`	`bool`	Whether to prune the triangle inequality.	`None`
`includeBondOrder`	`bool`	Whether to consider bond order.	`None`
`shortestPathsOnly`	`bool`	Whether to only use the shortest path between pharmacophores.	`None`
`useCounts`	`bool`	Whether to take into account the count information. This will also impact how the folding works.	`None`
`bins`	`List[Tuple[int, int]]`	Bins to use.	`None`

Source code in molfeat/calc/pharmacophore.py

def __init__(
    self,
    factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
    length: Optional[int] = 2048,
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
    **kwargs,
):
    """Set up a 2D pharmacophore calculator.

    Args:
        factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
            to a feature definition or a feature factory object
        length: Optional desired length. If provided, the fp will be refolded or padded to that length.
            If set to None, fall back to the default for the provided sig factory.
        useCounts: Whether to take into account the count information. This will also impact how the folding works.
        minPointCount: Minimum number of points.
        maxPointCount: Maximum number of points.
        shortestPathsOnly: Whether to only use the shortest path between pharmacophores.
        includeBondOrder: Whether to consider bond order.
        skipFeats: Stored as given on the instance.
            NOTE(review): presumably the feature families to ignore - confirm in `_init_sig_factory`.
        trianglePruneBins: Whether to prune the triangle inequality.
        bins: Bins to use.
        kwargs: Accepted but not used by this constructor.
    """

    # Store every setting under its own name, in the same order as before.
    settings = {
        "factory": factory,
        "useCounts": useCounts,
        "minPointCount": minPointCount,
        "maxPointCount": maxPointCount,
        "shortestPathsOnly": shortestPathsOnly,
        "includeBondOrder": includeBondOrder,
        "skipFeats": skipFeats,
        "trianglePruneBins": trianglePruneBins,
        "bins": bins,
        "length": length,
    }
    for attr, value in settings.items():
        setattr(self, attr, value)

    # The signature factory is built from the attributes stored above
    self._init_sig_factory()

`len()` ¶

Returns the length of the pharmacophore

Source code in molfeat/calc/pharmacophore.py

def __len__(self):
    """Return the size of the fingerprint.

    The configured `length` wins when it is set (and non-zero); otherwise the
    size reported by the signature factory is used.
    """
    if self.length:
        return self.length
    return self.sig_factory.GetSigSize()

`setstate(state)` ¶

Reload the class from pickling.

Source code in molfeat/calc/pharmacophore.py

def __setstate__(self, state: dict):
    """Reload the class from pickling.

    Args:
        state: mapping of attribute names to values to restore on the instance.
    """
    # Restore the plain attributes first, then re-run the signature factory setup.
    vars(self).update(state)
    self._init_sig_factory()

`Pharmacophore3D` ¶

Bases: SerializableCalculator

3D Pharmacophore.

The fingerprint is computed using pmapper.

This featurizer supports building a consensus pharmacophore from a set of molecules.

Source code in molfeat/calc/pharmacophore.py

class Pharmacophore3D(SerializableCalculator):
    """3D Pharmacophore.

    The fingerprint is computed using [`pmapper`](https://github.com/DrrDom/pmapper).

    This featurizer supports building a consensus pharmacophore from a set of molecules.
    """

    def __init__(
        self,
        factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
        length: int = 2048,
        bin_step: float = 1,
        min_features: int = 2,
        max_features: int = 3,
        use_modulo: bool = True,
        tolerance: float = 0,
    ):
        """Pharmacophore computation.

        Args:
            factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
                to a feature definition or a feature factory object
            length: Size of the fingerprint. Every pharmacophore hash is mapped to a bit
                index in `[0, length)`.
            bin_step: `bin_step` given to the pmapper `Pharm` engine.
            min_features: `min_features` given to pmapper's `iterate_pharm`.
            max_features: `max_features` given to pmapper's `iterate_pharm`.
            use_modulo: If True, the bit index of a hash is `hash % length`. Otherwise the
                hash seeds a random number generator and the bit index is drawn from
                `range(length)`.
            tolerance: `tol` given to pmapper's `iterate_pharm`.
        """

        self.factory = factory
        self.length = length
        self.bin_step = bin_step
        self.min_features = min_features
        self.max_features = max_features
        self.use_modulo = use_modulo
        self.tolerance = tolerance

        self._init_feature_factory()

    def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1, raw: bool = False):
        """Compute the Pharmacophore fingerprint for the input molecule.

        Args:
            mol: the molecule of interest
            conformer_id: the conformer id to use.
            raw: Whether to return the raw fingerprint or a Numpy array.

        Raises:
            ValueError: if the molecule is not valid or has no conformer.

        Returns:
            fp: the computed fingerprint, see `compute_fp_from_coords`.
        """

        # Get a molecule
        mol = dm.to_mol(mol)

        if mol is None:
            raise ValueError("The input molecule is not valid.")

        if mol.GetNumConformers() < 1:  # type: ignore
            raise ValueError("Expected a molecule with conformers information.")

        # Get the features for the mol
        features = self.get_features(mol, conformer_id=conformer_id)

        # Convert features dataframe to coordinates
        if features.empty:
            features_coords = []
        else:
            features_coords = features[["feature_name", "coords"]].values.tolist()

        # Compute the fingerprint
        return self.compute_fp_from_coords(features_coords, raw=raw)

    def consensus_fp(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        min_samples_ratio: float = 0.5,
        eps: float = 2,
        raw: bool = False,
        **cluster_kwargs,
    ):
        """Compute a consensus fingerprint from a list of molecules.

        Args:
            mols: a list of molecules.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other.
            raw: Whether to return the raw fingerprint or a Numpy array.
            cluster_kwargs: additional keyword arguments for the clustering algorithm.

        Returns:
            fp: the fingerprint of the clustered (consensus) features, see
                `compute_fp_from_coords`.
        """

        # Get all the features
        features = self.get_features_from_many(
            mols,
            keep_mols=True,
            align=align,
            conformer_id=conformer_id,
            copy=copy,
        )

        # Cluster the features
        clustered_features = self.cluster_features(
            features, min_samples_ratio=min_samples_ratio, eps=eps, **cluster_kwargs
        )
        # Convert features dataframe to coordinates
        if clustered_features.empty:
            features_coords = []
        else:
            features_coords = clustered_features[["feature_name", "coords"]].values.tolist()

        # Compute the fingerprint
        return self.compute_fp_from_coords(features_coords, raw=raw)

    def _init_feature_factory(self):
        """Init the feature factory."""
        self.feature_factory = get_feature_factory(self.factory)

    def get_features(self, mol: dm.Mol, conformer_id: int = -1) -> pd.DataFrame:
        """Retrieve the features for a given molecule.

        Args:
            mol: the molecule of interest
            conformer_id: the conformer id to read the feature positions from.

        Returns:
            features: a dataframe with one row per feature and the columns `feature_id`,
                `feature_name`, `feature_type`, `atom_indices` and `coords`. It is empty
                (without columns) when the molecule has no feature.
        """
        # Extract the features for this molecule
        features = self.feature_factory.GetFeaturesForMol(mol, confId=conformer_id)

        # One record per feature
        features_data = [
            {
                "feature_id": feature.GetId(),
                "feature_name": feature.GetFamily(),
                "feature_type": feature.GetType(),
                "atom_indices": feature.GetAtomIds(),
                "coords": np.array(feature.GetPos()),
            }
            for feature in features
        ]

        return pd.DataFrame(features_data)

    def get_features_from_many(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        keep_mols: bool = False,
    ):
        """Extract all the features from a list of molecules after an optional
        alignement step.

        Args:
            mols: List of molecules with conformers.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            keep_mols: Whether to keep the molecules in the returned dataframe.

        Raises:
            ValueError: if one of the molecules has no conformer.

        Returns:
            features: the concatenated output of `get_features` for every molecule, with
                an extra `mol_index` column (position of the molecule in `mols`) and,
                if `keep_mols` is True, a `mol` column.
        """

        if not all(mol.GetNumConformers() >= 1 for mol in mols):
            raise ValueError("One or more input molecules is missing a conformer.")

        # Make a copy of the molecules since they are going to be modified
        if copy:
            mols = [dm.copy_mol(mol) for mol in mols]

        # Align the conformers
        if align:
            mols, _ = commons.align_conformers(mols, copy=False, conformer_id=conformer_id)

        per_mol_features = []
        for i, mol in enumerate(mols):
            # read the features from the requested conformer, not always the default one
            features = self.get_features(mol, conformer_id=conformer_id)
            features["mol_index"] = i

            if keep_mols:
                features["mol"] = mol

            per_mol_features.append(features)

        if not per_mol_features:
            # `pd.concat` refuses an empty list
            return pd.DataFrame()

        # a single concatenation instead of growing the dataframe at every iteration
        return pd.concat(per_mol_features, ignore_index=True)

    def compute_fp_from_coords(
        self,
        features_coords: List[Tuple[str, Tuple[float]]],
        raw: bool = False,
    ):
        """Compute a fingerprint from a list of features.

        Args:
            features_coords: Features coords: `[('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...]`.
            raw: Whether to return the raw fingerprint or a Numpy array.

        Returns:
            fp: if `raw` is True, the sorted indices of the "on" bits as a 1D integer array,
                otherwise a 0/1 integer array of size `self.length`.
        """

        # Init the pmapper engine
        ph_engine = Pharm(bin_step=self.bin_step)
        # Convert coords to tuples in case those are arrays
        features_coords = [(name, tuple(coords)) for name, coords in features_coords]
        # Load pharmacophore points
        ph_engine.load_from_feature_coords(features_coords)
        # Init the iterator over the pharmacophore points
        points_iterator = ph_engine.iterate_pharm(
            min_features=self.min_features,
            max_features=self.max_features,
            tol=self.tolerance,
            return_feature_ids=False,
        )

        # Compute the fingerprint
        on_bits = set()
        for h in points_iterator:
            hash_value = int(h, 16)  # type: ignore
            if self.use_modulo:
                on_bits.add(hash_value % self.length)
            else:
                # A private generator gives the same draw as seeding the module-level
                # one, without resetting the global random state of the caller.
                on_bits.add(random.Random(hash_value).randrange(self.length))

        if raw:
            # `np.array(a_set)` would be a 0-d object array, so build a sequence first
            return np.array(sorted(on_bits), dtype=int)

        fp = np.zeros(self.length, dtype=int)
        fp[list(on_bits)] = 1

        return fp

    def cluster_features(
        self,
        features: pd.DataFrame,
        min_samples_ratio: float = 0.5,
        n_mols: int = None,
        eps: float = np.inf,
        **kwargs,
    ):
        """Cluster a set of pharmacophoric features using OPTICS.
        The only reason why we are not using SpectralClustering is because of the need to provide
        the number of clusters.

        Args:
            features: A dataframe of features.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            n_mols: Optional number of compounds to compute `min_samples` from the
                `min_samples_ratio` value. If not set it will use `mol_index` from
                the `features` dataframe.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other. This is max_eps in OPTICS
            kwargs: Any additional parameters to pass to `sklearn.cluster.OPTICS`.

        Returns:
            clusters: a dataframe with one row per cluster and the columns `feature_id`,
                `feature_name`, `coords` (centroid of the cluster) and `cluster_size`.
                It is empty when no cluster is found.
        """

        if features.empty:
            # Nothing to cluster, and the columns read below may not even exist
            return pd.DataFrame()

        if n_mols is None:
            n_mols = len(features["mol_index"].unique())

        # Compute min_samples
        min_samples = max(int(round(min_samples_ratio * n_mols, 0)), 1)
        clusters = []
        feature_id = 0
        for feature_name, rows in features.groupby("feature_name"):
            if min_samples > rows.shape[0]:
                logger.info(
                    f"Feature {feature_name} does not have enough molecule ({len(rows)}), skipping"
                )
                continue
            coords = np.vstack(rows["coords"].values)

            # Init clustering
            optics = OPTICS(min_samples=min_samples, max_eps=eps, **kwargs)
            optics = optics.fit(coords)
            labels = optics.labels_

            # Find the centroids (consensus points).
            # A node that is not a core is a node that cannot be labeled (label -1), thus
            # every labeled node, border nodes included, contributes to its cluster.
            for k in np.unique(labels):
                if k == -1:
                    continue
                cluster_coords = coords[labels == k]

                clusters.append(
                    {
                        "feature_id": feature_id,
                        "feature_name": feature_name,
                        "coords": cluster_coords.mean(axis=0),
                        "cluster_size": len(cluster_coords),
                    }
                )
                feature_id += 1

        return pd.DataFrame(clusters)

    ## Viz methods

    def show(
        self,
        mol: dm.Mol,
        features: pd.DataFrame = None,
        alpha: float = 1.0,
        sphere_radius: float = 0.4,
        show_legend: bool = True,
    ):
        """Show a 3D view of a given molecule with the pharmacophoric features.

        Args:
            mol: the molecule of interest
            features: Optional dataframe of features to display. If not set, the
                features are computed with `get_features`.
            alpha: Alpha value for the colors (currently not working).
            sphere_radius: Radius of the spheres for the features.
            show_legend: Display the legend (the layout is bad but at least it
                shows the legend).
        """

        if features is None:
            features = self.get_features(mol)

        return viz.show_pharm_features(
            mol,
            features=features,
            feature_factory=self.feature_factory,
            alpha=alpha,
            sphere_radius=sphere_radius,
            show_legend=show_legend,
        )

    def show_many(
        self,
        mols: List[dm.Mol],
        align: bool = True,
        conformer_id: int = -1,
        copy: bool = True,
        min_samples_ratio: float = 0.5,
        eps: float = 2,
        alpha: float = 1.0,
        sphere_radius: float = 0.4,
        show_legend: bool = True,
    ):
        """Show a 3D view of a list of molecules with their clustered (consensus)
        pharmacophoric features.

        Args:
            mols: a list of molecules.
            align: Whether to align the conformers of the molecules.
            conformer_id: Optional conformer id.
            copy: Whether to copy the molecules before clustering.
            min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
                to be considered as a core point.
            eps: The maximum distance between two samples for one to be considered as
                in the neighborhood of the other.
            alpha: Alpha value for the colors (currently not working).
            sphere_radius: Radius of the spheres for the features.
            show_legend: Display the legend (the layout is bad but at least it
                shows the legend).
        """

        # Get all the features
        features = self.get_features_from_many(
            mols,
            keep_mols=True,
            align=align,
            conformer_id=conformer_id,
            copy=copy,
        )

        # Retrieve the aligned molecules
        mols = features.groupby("mol_index").first()["mol"].tolist()

        # Cluster the features
        clustered_features = self.cluster_features(
            features,
            min_samples_ratio=min_samples_ratio,
            eps=eps,
        )

        return viz.show_pharm_features(
            mols,
            features=clustered_features,
            feature_factory=self.feature_factory,
            alpha=alpha,
            sphere_radius=sphere_radius,
            show_legend=show_legend,
        )

    def __getstate__(self):
        """Serialize the class for pickling.

        Only the constructor settings are kept; the feature factory object is rebuilt
        by `__setstate__`.
        """
        state = {}
        state["factory"] = self.factory
        state["length"] = self.length
        state["bin_step"] = self.bin_step
        state["min_features"] = self.min_features
        state["max_features"] = self.max_features
        state["use_modulo"] = self.use_modulo
        state["tolerance"] = self.tolerance
        return state

    def __setstate__(self, state: dict):
        """Reload the class from pickling."""
        self.__dict__.update(state)
        self._init_feature_factory()

`bin_step = bin_step` `instance-attribute` ¶

`factory = factory` `instance-attribute` ¶

`length = length` `instance-attribute` ¶

`max_features = max_features` `instance-attribute` ¶

`min_features = min_features` `instance-attribute` ¶

`tolerance = tolerance` `instance-attribute` ¶

`use_modulo = use_modulo` `instance-attribute` ¶

`call(mol, conformer_id=-1, raw=False)` ¶

Compute the Pharmacophore fingerprint for the input molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Union[dm.Mol, str]`	the molecule of interest	required
`conformer_id`	`int`	the conformer id to use.	`-1`
`raw`	`bool`	Whether to return the raw fingerprint or a Numpy array.	`False`

Returns:

Name	Type	Description
`fp`		the computed fingerprint as a Numpy array.

Source code in molfeat/calc/pharmacophore.py

def __call__(self, mol: Union[dm.Mol, str], conformer_id: int = -1, raw: bool = False):
    """Compute the 3D pharmacophore fingerprint of a molecule.

    Args:
        mol: the molecule of interest
        conformer_id: the conformer id to use.
        raw: Whether to return the raw fingerprint or a Numpy array.

    Raises:
        ValueError: if the molecule is not valid or has no conformer.

    Returns:
        fp: the fingerprint built by `compute_fp_from_coords` from the features
            of the molecule.
    """

    mol = dm.to_mol(mol)
    if mol is None:
        raise ValueError("The input molecule is not valid.")
    if mol.GetNumConformers() < 1:  # type: ignore
        raise ValueError("Expected a molecule with conformers information.")

    features = self.get_features(mol, conformer_id=conformer_id)

    # Only the family name and the position of each feature feed the fingerprint
    features_coords = (
        [] if features.empty else features[["feature_name", "coords"]].values.tolist()
    )

    return self.compute_fp_from_coords(features_coords, raw=raw)

`getstate()` ¶

Serialize the class for pickling.

Source code in molfeat/calc/pharmacophore.py

def __getstate__(self):
    """Serialize the class for pickling.

    Returns:
        state: a dict holding the constructor settings of the calculator.
            Nothing else stored on the instance is included.
    """
    # Attribute names are listed in the order they must appear in the state.
    persisted = (
        "factory",
        "length",
        "bin_step",
        "min_features",
        "max_features",
        "use_modulo",
        "tolerance",
    )
    return {attr: getattr(self, attr) for attr in persisted}

`init(factory='pmapper', length=2048, bin_step=1, min_features=2, max_features=3, use_modulo=True, tolerance=0)` ¶

Pharmacophore computation.

Parameters:

Name	Type	Description	Default
`factory`	`Union[str, MolChemicalFeatureFactory]`	Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path to a feature definition or a feature factory object	`'pmapper'`
`length`	`int`	Size of the fingerprint. Every pharmacophore hash is mapped to a bit index in `[0, length)`.	`2048`
`bin_step`	`float`	`bin_step` given to the pmapper `Pharm` engine.	`1`
`min_features`	`int`	`min_features` given to pmapper's `iterate_pharm`.	`2`
`max_features`	`int`	`max_features` given to pmapper's `iterate_pharm`.	`3`
`use_modulo`	`bool`	If True, the bit index of a hash is `hash % length`. Otherwise the hash seeds a random number generator and the bit index is drawn from `range(length)`.	`True`
`tolerance`	`float`	`tol` given to pmapper's `iterate_pharm`.	`0`

Source code in molfeat/calc/pharmacophore.py

def __init__(
    self,
    factory: Union[str, MolChemicalFeatureFactory] = "pmapper",
    length: int = 2048,
    bin_step: float = 1,
    min_features: int = 2,
    max_features: int = 3,
    use_modulo: bool = True,
    tolerance: float = 0,
):
    """Set up a 3D pharmacophore calculator.

    Args:
        factory: Which features factory to use. One of "default", "cats", "gobbi" , "pmapper" or path
            to a feature definition or a feature factory object
        length: Size of the fingerprint. Every pharmacophore hash is mapped to a bit
            index in `[0, length)`.
        bin_step: `bin_step` given to the pmapper `Pharm` engine.
        min_features: `min_features` given to pmapper's `iterate_pharm`.
        max_features: `max_features` given to pmapper's `iterate_pharm`.
        use_modulo: If True, the bit index of a hash is `hash % length`. Otherwise the
            hash seeds a random number generator and the bit index is drawn from
            `range(length)`.
        tolerance: `tol` given to pmapper's `iterate_pharm`.
    """

    # Store every setting under its own name, in the same order as before.
    settings = {
        "factory": factory,
        "length": length,
        "bin_step": bin_step,
        "min_features": min_features,
        "max_features": max_features,
        "use_modulo": use_modulo,
        "tolerance": tolerance,
    }
    for attr, value in settings.items():
        setattr(self, attr, value)

    # The feature factory is resolved from `self.factory` stored above
    self._init_feature_factory()

`setstate(state)` ¶

Reload the class from pickling.

Source code in molfeat/calc/pharmacophore.py

def __setstate__(self, state: dict):
    """Reload the class from pickling.

    Args:
        state: mapping of attribute names to values to restore on the instance.
    """
    # Restore the plain attributes first, then re-run the feature factory setup.
    vars(self).update(state)
    self._init_feature_factory()

`cluster_features(features, min_samples_ratio=0.5, n_mols=None, eps=np.inf, **kwargs)` ¶

Cluster a set of pharmacophoric features using OPTICS. The only reason why we are not using SpectralClustering is because of the need to provide the number of clusters.

Parameters:

Name	Type	Description	Default
`features`	`pd.DataFrame`	A dataframe of features.	required
`min_samples_ratio`	`float`	Percentages of mols that must contain a pharmacophoric point to be considered as a core point.	`0.5`
`n_mols`	`int`	Optional number of compounds to compute `min_samples` from the `min_samples_ratio` value. If not set it will use `mol_index` from the `features` dataframe.	`None`
`eps`	`float`	The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is max_eps in OPTICS	`np.inf`
`kwargs`		Any additional parameters to pass to `sklearn.cluster.OPTICS`.	`{}`

Source code in molfeat/calc/pharmacophore.py

def cluster_features(
    self,
    features: pd.DataFrame,
    min_samples_ratio: float = 0.5,
    n_mols: int = None,
    eps: float = np.inf,
    **kwargs,
):
    """Cluster a set of pharmacophoric features using OPTICS.
    The only reason why we are not using SpectralClustering is because of the need to provide
    the number of clusters.

    Args:
        features: A dataframe of features.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        n_mols: Optional number of compounds to compute `min_samples` from the
            `min_samples_ratio` value. If not set it will use `mol_index` from
            the `features` dataframe.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other. This is max_eps in OPTICS
        kwargs: Any additional parameters to pass to `sklearn.cluster.OPTICS`.

    Returns:
        clusters: a dataframe with one row per cluster and the columns `feature_id`,
            `feature_name`, `coords` (centroid of the cluster) and `cluster_size`.
            It is empty when no cluster is found.
    """

    if features.empty:
        # Nothing to cluster, and the columns read below may not even exist
        return pd.DataFrame()

    if n_mols is None:
        n_mols = len(features["mol_index"].unique())

    # Compute min_samples
    min_samples = max(int(round(min_samples_ratio * n_mols, 0)), 1)
    clusters = []
    feature_id = 0
    for feature_name, rows in features.groupby("feature_name"):
        if min_samples > rows.shape[0]:
            logger.info(
                f"Feature {feature_name} does not have enough molecule ({len(rows)}), skipping"
            )
            continue
        coords = np.vstack(rows["coords"].values)

        # Init clustering
        optics = OPTICS(min_samples=min_samples, max_eps=eps, **kwargs)
        optics = optics.fit(coords)
        labels = optics.labels_

        # Find the centroids (consensus points).
        # A node that is not a core is a node that cannot be labeled (label -1), thus
        # every labeled node, border nodes included, contributes to its cluster.
        for k in np.unique(labels):
            if k == -1:
                continue
            cluster_coords = coords[labels == k]

            clusters.append(
                {
                    "feature_id": feature_id,
                    "feature_name": feature_name,
                    "coords": cluster_coords.mean(axis=0),
                    "cluster_size": len(cluster_coords),
                }
            )
            feature_id += 1

    return pd.DataFrame(clusters)

`compute_fp_from_coords(features_coords, raw=False)` ¶

Compute a fingerprint from a list of features.

Parameters:

Name	Type	Description	Default
`features_coords`	`List[Tuple[str, Tuple[float]]]`	Features coords: `[('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...]`.	required
`raw`	`bool`	Whether to return the raw fingerprint or a Numpy array.	`False`

Source code in molfeat/calc/pharmacophore.py

def compute_fp_from_coords(
    self,
    features_coords: List[Tuple[str, Tuple[float]]],
    raw: bool = False,
):
    """Compute a fingerprint from a list of features.

    Args:
        features_coords: Features coords: `[('A', (1.23, 2.34, 3.45)), ('A', (4.56, 5.67, 6.78)), ...]`.
        raw: Whether to return the raw fingerprint or a Numpy array.

    Returns:
        fp: if `raw` is True, the sorted indices of the "on" bits as a 1D integer array,
            otherwise a 0/1 integer array of size `self.length`.
    """

    # Init the pmapper engine
    ph_engine = Pharm(bin_step=self.bin_step)
    # Convert coords to tuples in case those are arrays
    features_coords = [(name, tuple(coords)) for name, coords in features_coords]
    # Load pharmacophore points
    ph_engine.load_from_feature_coords(features_coords)
    # Init the iterator over the pharmacophore points
    points_iterator = ph_engine.iterate_pharm(
        min_features=self.min_features,
        max_features=self.max_features,
        tol=self.tolerance,
        return_feature_ids=False,
    )

    # Compute the fingerprint
    on_bits = set()
    for h in points_iterator:
        hash_value = int(h, 16)  # type: ignore
        if self.use_modulo:
            on_bits.add(hash_value % self.length)
        else:
            # A private generator gives the same draw as seeding the module-level
            # one, without resetting the global random state of the caller.
            on_bits.add(random.Random(hash_value).randrange(self.length))

    if raw:
        # `np.array(a_set)` would be a 0-d object array, so build a sequence first
        return np.array(sorted(on_bits), dtype=int)

    fp = np.zeros(self.length, dtype=int)
    fp[list(on_bits)] = 1

    return fp

`consensus_fp(mols, align=True, conformer_id=-1, copy=True, min_samples_ratio=0.5, eps=2, raw=False, **cluster_kwargs)` ¶

Compute a consensus fingerprint from a list of molecules.

Parameters:

Name	Type	Description	Default
`mols`	`List[dm.Mol]`	a list of molecules.	required
`align`	`bool`	Whether to align the conformers of the molecules.	`True`
`conformer_id`	`int`	Optional conformer id.	`-1`
`copy`	`bool`	Whether to copy the molecules before clustering.	`True`
`min_samples_ratio`	`float`	Percentages of mols that must contain a pharmacophoric point to be considered as a core point.	`0.5`
`eps`	`float`	The maximum distance between two samples for one to be considered as in the neighborhood of the other.	`2`
`raw`	`bool`	Whether to return the raw fingerprint or a Numpy array.	`False`
`cluster_kwargs`		additional keyword arguments for the clustering algorithm.	`{}`

Source code in molfeat/calc/pharmacophore.py

def consensus_fp(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    min_samples_ratio: float = 0.5,
    eps: float = 2,
    raw: bool = False,
    **cluster_kwargs,
):
    """Build one consensus fingerprint out of several molecules.

    The features of every molecule are extracted (after an optional alignment),
    clustered, and the clustered feature coordinates are turned into a
    fingerprint with `compute_fp_from_coords`.

    Args:
        mols: a list of molecules.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id.
        copy: Whether to copy the molecules before clustering.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other.
        raw: Whether to return the raw fingerprint or a Numpy array.
        cluster_kwargs: additional keyword arguments for the clustering algorithm.

    Returns:
        The value returned by `compute_fp_from_coords` for the clustered features.
    """

    # Per-molecule features; the molecules themselves are kept in a "mol" column
    all_features = self.get_features_from_many(
        mols,
        align=align,
        conformer_id=conformer_id,
        copy=copy,
        keep_mols=True,
    )

    # One (aligned) molecule per `mol_index`. The list is not used further down
    # in this method; the lookup is kept so the method behaves as before.
    mols = all_features.groupby("mol_index").first()["mol"].tolist()

    # Group the features shared across the molecules
    consensus = self.cluster_features(
        all_features, min_samples_ratio=min_samples_ratio, eps=eps, **cluster_kwargs
    )

    # Turn the clustered dataframe into a list of [feature_name, coords] entries
    coords = [] if consensus.empty else consensus[["feature_name", "coords"]].values.tolist()

    return self.compute_fp_from_coords(coords, raw=raw)

`get_features(mol, conformer_id=-1)` ¶

Retrieve the features for a given molecule.

Parameters:

Name	Type	Description	Default
`mol`	`dm.Mol`	the molecule of interest	required

Returns:

Name	Type	Description
`features`	`pd.DataFrame`	the features as a pandas DataFrame (one row per feature)

Source code in molfeat/calc/pharmacophore.py

def get_features(self, mol: dm.Mol, conformer_id: int = -1) -> pd.DataFrame:
    """Retrieve the features for a given molecule.

    Args:
        mol: the molecule of interest
        conformer_id: id of the conformer to use; forwarded to the feature
            factory as `confId`.

    Returns:
        features: a dataframe with one row per feature and the columns
            `feature_id`, `feature_name`, `feature_type`, `atom_indices` and
            `coords`. When the molecule has no feature the dataframe is empty
            but still exposes these columns.
    """
    # Fixing the columns up-front guarantees a stable schema even when the
    # factory finds no feature (otherwise the frame would have no columns and
    # downstream column lookups would fail).
    columns = ["feature_id", "feature_name", "feature_type", "atom_indices", "coords"]

    # Extract the features for this molecule
    features = self.feature_factory.GetFeaturesForMol(mol, confId=conformer_id)

    # One record per feature
    features_data = [
        {
            "feature_id": feature.GetId(),
            "feature_name": feature.GetFamily(),
            "feature_type": feature.GetType(),
            "atom_indices": feature.GetAtomIds(),
            "coords": np.array(feature.GetPos()),
        }
        for feature in features
    ]

    return pd.DataFrame(features_data, columns=columns)

`get_features_from_many(mols, align=True, conformer_id=-1, copy=True, keep_mols=False)` ¶

Extract all the features from a list of molecules after an optional alignment step.

Parameters:

Name	Type	Description	Default
`mols`	`List[dm.Mol]`	List of molecules with conformers.	required
`align`	`bool`	Whether to align the conformers of the molecules.	`True`
`conformer_id`	`int`	Optional conformer id.	`-1`
`copy`	`bool`	Whether to copy the molecules before clustering.	`True`
`keep_mols`	`bool`	Whether to keep the molecules in the returned dataframe.	`False`

Source code in molfeat/calc/pharmacophore.py

def get_features_from_many(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    keep_mols: bool = False,
):
    """Extract all the features from a list of molecules after an optional
    alignment step.

    Args:
        mols: List of molecules with conformers.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id. It is used both for the alignment
            and for the feature extraction.
        copy: Whether to copy the molecules before clustering.
        keep_mols: Whether to keep the molecules in the returned dataframe.

    Returns:
        A dataframe concatenating the features of every molecule, with an extra
        `mol_index` column (position of the molecule in `mols`) and, when
        `keep_mols` is set, a `mol` column. An empty dataframe is returned when
        `mols` is empty.

    Raises:
        ValueError: if at least one molecule has no conformer.
    """

    if not all(mol.GetNumConformers() >= 1 for mol in mols):
        raise ValueError("One or more input molecules is missing a conformer.")

    # Make a copy of the molecules since they are going to be modified
    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    # Align the conformers
    if align:
        mols, _ = commons.align_conformers(mols, copy=False, conformer_id=conformer_id)

    per_mol_features = []
    for i, mol in enumerate(mols):
        # Use the same conformer that was requested (and aligned) above rather
        # than silently falling back to the default one.
        features = self.get_features(mol, conformer_id=conformer_id)
        features["mol_index"] = i

        if keep_mols:
            features["mol"] = mol

        per_mol_features.append(features)

    # `pd.concat` refuses an empty list; keep returning an empty dataframe
    if not per_mol_features:
        return pd.DataFrame()

    # Single concatenation instead of growing the dataframe inside the loop
    return pd.concat(per_mol_features, ignore_index=True)

`show(mol, features=None, alpha=1.0, sphere_radius=0.4, show_legend=True)` ¶

Show a 3D view of a given molecule with the pharmacophoric features.

Parameters:

Name	Type	Description	Default
`mol`	`dm.Mol`	the molecule of interest	required
`alpha`	`float`	Alpha value for the colors (currently not working).	`1.0`
`sphere_radius`	`float`	Radius of the spheres for the features.	`0.4`
`show_legend`	`bool`	Display the legend (the layout is bad but at least it shows the legend).	`True`

Source code in molfeat/calc/pharmacophore.py

def show(
    self,
    mol: dm.Mol,
    features: pd.DataFrame = None,
    alpha: float = 1.0,
    sphere_radius: float = 0.4,
    show_legend: bool = True,
):
    """Display a molecule in 3D together with its pharmacophoric features.

    Args:
        mol: the molecule of interest
        features: features to display. When omitted, they are computed from
            `mol` with `get_features`.
        alpha: Alpha value for the colors (currently not working).
        sphere_radius: Radius of the spheres for the features.
        show_legend: Display the legend (the layout is bad but at least it
            shows the legend).

    Returns:
        Whatever `viz.show_pharm_features` returns.
    """

    # Fall back to this calculator's own features when none are supplied
    shown_features = self.get_features(mol) if features is None else features

    view_options = dict(
        features=shown_features,
        feature_factory=self.feature_factory,
        alpha=alpha,
        sphere_radius=sphere_radius,
        show_legend=show_legend,
    )
    return viz.show_pharm_features(mol, **view_options)

`show_many(mols, align=True, conformer_id=-1, copy=True, min_samples_ratio=0.5, eps=2, alpha=1.0, sphere_radius=0.4, show_legend=True)` ¶

Show a 3D view of a list of molecules with their clustered (consensus) pharmacophoric features.

Parameters:

Name	Type	Description	Default
`mols`	`List[dm.Mol]`	a list of molecules.	required
`align`	`bool`	Whether to align the conformers of the molecules.	`True`
`conformer_id`	`int`	Optional conformer id.	`-1`
`copy`	`bool`	Whether to copy the molecules before clustering.	`True`
`min_samples_ratio`	`float`	Percentages of mols that must contain a pharmacophoric point to be considered as a core point.	`0.5`
`eps`	`float`	The maximum distance between two samples for one to be considered as in the neighborhood of the other.	`2`
`alpha`	`float`	Alpha value for the colors (currently not working).	`1.0`
`sphere_radius`	`float`	Radius of the spheres for the features.	`0.4`
`show_legend`	`bool`	Display the legend (the layout is bad but at least it shows the legend).	`True`

Source code in molfeat/calc/pharmacophore.py

def show_many(
    self,
    mols: List[dm.Mol],
    align: bool = True,
    conformer_id: int = -1,
    copy: bool = True,
    min_samples_ratio: float = 0.5,
    eps: float = 2,
    alpha: float = 1.0,
    sphere_radius: float = 0.4,
    show_legend: bool = True,
):
    """Display several molecules in 3D together with their clustered features.

    Args:
        mols: a list of molecules.
        align: Whether to align the conformers of the molecules.
        conformer_id: Optional conformer id.
        copy: Whether to copy the molecules before clustering.
        min_samples_ratio: Percentages of mols that must contain a pharmacophoric point
            to be considered as a core point.
        eps: The maximum distance between two samples for one to be considered as
            in the neighborhood of the other.
        alpha: Alpha value for the colors (currently not working).
        sphere_radius: Radius of the spheres for the features.
        show_legend: Display the legend (the layout is bad but at least it
            shows the legend).

    Returns:
        Whatever `viz.show_pharm_features` returns.
    """

    # Per-molecule features, keeping the (possibly copied/aligned) molecules
    all_features = self.get_features_from_many(
        mols,
        align=align,
        conformer_id=conformer_id,
        copy=copy,
        keep_mols=True,
    )

    # The molecules to draw are the ones stored alongside the features,
    # one per `mol_index`
    aligned_mols = all_features.groupby("mol_index").first()["mol"].tolist()

    # Only the clustered features are displayed
    consensus = self.cluster_features(all_features, min_samples_ratio=min_samples_ratio, eps=eps)

    view_options = dict(
        features=consensus,
        feature_factory=self.feature_factory,
        alpha=alpha,
        sphere_radius=sphere_radius,
        show_legend=show_legend,
    )
    return viz.show_pharm_features(aligned_mols, **view_options)

`get_feature_factory(factory)` ¶

Build a feature factory.

Source code in molfeat/calc/pharmacophore.py

def get_feature_factory(
    factory: Union[str, MolChemicalFeatureFactory]
) -> MolChemicalFeatureFactory:
    """Build a feature factory.

    Args:
        factory: either an existing `MolChemicalFeatureFactory` (returned as is),
            one of the names `"pmapper"`, `"gobbi"`, `"cats"`, `"default"`, or
            the path of an existing feature definition file.

    Returns:
        The feature factory matching `factory`.

    Raises:
        ValueError: if `factory` is none of the above.
    """

    # An already-built factory is handed back untouched
    if isinstance(factory, MolChemicalFeatureFactory):
        return factory

    if factory == "pmapper":
        # Definition file shipped inside the pmapper package
        with pkg_resources.path("pmapper", "smarts_features.fdef") as fdef_path:
            return ChemicalFeatures.BuildFeatureFactory(str(fdef_path))  # type: ignore

    if factory == "gobbi":
        return Gobbi_Pharm2D.factory.featFactory

    if factory == "cats":
        # Definition file shipped with molfeat's data
        with pkg_resources.open_text("molfeat.data", "cats_features.fdef") as fdef_stream:
            return ChemicalFeatures.BuildFeatureFactoryFromString(fdef_stream.read())  # type: ignore

    if factory == "default":
        # Load default feature definition file
        base_fdef = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
        return ChemicalFeatures.BuildFeatureFactory(base_fdef)  # type: ignore

    # Last resort: treat the value as a path to a feature definition file
    if not dm.fs.exists(factory):
        raise ValueError(f"The factory '{factory}' is not supported.")

    with fsspec.open(factory, "r") as fdef_stream:
        fdef_content = fdef_stream.read()
        return ChemicalFeatures.BuildFeatureFactoryFromString(fdef_content)  # type: ignore

`get_sig_factory(factory, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, init_factory=True)` ¶

Build a signature factory.

Source code in molfeat/calc/pharmacophore.py

def get_sig_factory(
    factory: Union[str, MolChemicalFeatureFactory],
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
    init_factory: bool = True,
):
    """Build a signature factory.

    Args:
        factory: value forwarded to `get_feature_factory` and, for the default
            parameters, to `get_sig_factory_params`.
        useCounts: optional override of the factory default.
        minPointCount: optional override of the factory default.
        maxPointCount: optional override of the factory default.
        shortestPathsOnly: optional override of the factory default.
        includeBondOrder: optional override of the factory default.
        skipFeats: optional override of the factory default.
        trianglePruneBins: optional override of the factory default.
        bins: optional distance bins replacing the factory default.
        init_factory: whether to call `Init()` on the signature factory
            before returning it.

    Returns:
        The configured `SigFactory`.
    """

    # The feature factory is resolved first, as the signature factory wraps it
    feature_factory = get_feature_factory(factory)

    # Default parameters for this factory, with the caller's overrides applied
    overrides = dict(
        useCounts=useCounts,
        minPointCount=minPointCount,
        maxPointCount=maxPointCount,
        shortestPathsOnly=shortestPathsOnly,
        includeBondOrder=includeBondOrder,
        skipFeats=skipFeats,
        trianglePruneBins=trianglePruneBins,
    )
    sig_params, distance_bins = get_sig_factory_params(factory, bins=bins, **overrides)

    sig_factory = SigFactory(feature_factory, **sig_params)
    sig_factory.SetBins(distance_bins)

    if not init_factory:
        return sig_factory

    sig_factory.Init()
    return sig_factory

`get_sig_factory_params(factory_name, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None)` ¶

Get the default parameters for a given sig factory, allowing some of them to be overridden.

Parameters:

Name	Type	Description	Default
`factory_name`	`str`	The name of the factory.	required

Source code in molfeat/calc/pharmacophore.py

def get_sig_factory_params(
    factory_name: str,
    useCounts: bool = None,
    minPointCount: int = None,
    maxPointCount: int = None,
    shortestPathsOnly: bool = None,
    includeBondOrder: bool = None,
    skipFeats: List[str] = None,
    trianglePruneBins: bool = None,
    bins: List[Tuple[int, int]] = None,
) -> Tuple[Dict[str, Any], list]:
    """Return the default sig factory parameters of a factory, with optional overrides.

    Args:
        factory_name: The name of the factory. One of `"cats"`, `"gobbi"`,
            `"pmapper"` or `"default"`.
        useCounts: replaces the default when not `None`.
        minPointCount: replaces the default when not `None`.
        maxPointCount: replaces the default when not `None`.
        shortestPathsOnly: replaces the default when not `None`.
        includeBondOrder: replaces the default when not `None`.
        skipFeats: replaces (or adds) the `skipFeats` entry when not `None`.
        trianglePruneBins: replaces the default when not `None`.
        bins: distance bins; the factory default is used when this is `None`
            or empty.

    Returns:
        A `(params, bins)` tuple: the keyword parameters and the distance bins.

    Raises:
        ValueError: if `factory_name` has no known defaults.
    """

    # Defaults per factory. Everything is created inside the call so callers
    # can freely mutate what they get back.

    if factory_name == "cats":
        default_bins = [(lower, lower + 1) for lower in range(9)]
        params = {
            "useCounts": True,
            "minPointCount": 2,
            "maxPointCount": 2,
            "trianglePruneBins": True,
            "shortestPathsOnly": True,
            "includeBondOrder": False,
        }

    elif factory_name == "gobbi":
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]
        params = {
            "useCounts": False,
            "minPointCount": 2,
            "maxPointCount": 3,
            "trianglePruneBins": True,
            "shortestPathsOnly": True,
            "includeBondOrder": False,
        }

    elif factory_name == "pmapper":
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]
        params = {
            "useCounts": False,
            "minPointCount": 2,
            "maxPointCount": 3,
            "trianglePruneBins": False,
            "shortestPathsOnly": True,
            "includeBondOrder": False,
        }

    elif factory_name == "default":
        default_bins = [(2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 100)]
        params = {
            "useCounts": False,
            "minPointCount": 2,
            "maxPointCount": 3,
            "trianglePruneBins": False,
            "shortestPathsOnly": True,
            "skipFeats": ["ZnBinder", "LumpedHydrophobe"],
            "includeBondOrder": False,
        }

    else:
        raise ValueError(f"Default values for {factory_name} are not known.")

    # Caller-provided values win over the defaults; `None` means "not set"
    requested = {
        "minPointCount": minPointCount,
        "maxPointCount": maxPointCount,
        "trianglePruneBins": trianglePruneBins,
        "includeBondOrder": includeBondOrder,
        "useCounts": useCounts,
        "skipFeats": skipFeats,
        "shortestPathsOnly": shortestPathsOnly,
    }
    for key, value in requested.items():
        if value is not None:
            params[key] = value  # type: ignore

    # A missing or empty `bins` falls back to the factory default
    if not bins:
        bins = default_bins

    return params, bins

`Scaffold Keys`¶

`ScaffoldKeyCalculator` ¶

Bases: SerializableCalculator

Implementation of the Scaffold Keys described in Identification of Bioisosteric Scaffolds using Scaffold Keys by Peter Ertl

Source code in molfeat/calc/skeys.py

class ScaffoldKeyCalculator(SerializableCalculator):
    """
    Implementation of the Scaffold Keys described in
    `Identification of Bioisosteric Scaffolds using Scaffold Keys` by Peter Ertl

    Each entry of `DESCRIPTORS` is the name of a method of this class taking a
    molecule and returning one count. Calling the calculator evaluates them in
    that order and returns the resulting vector.
    """

    # Names of the descriptor methods, in output order
    DESCRIPTORS = [
        "n_atom_in_rings",
        "n_atom_in_conjugated_ring",
        "n_atoms_not_in_conjugated_ring",
        "n_atom_in_chain",
        "n_atom_exocyclic",
        "n_nitrogen",
        "n_nitrogen_in_ring",
        "n_oxygen",
        "n_oxygen_in_ring",
        "n_sulfur",
        "n_heteroatoms",
        "n_heteroatoms_in_ring",
        "n_atom_spiro_atoms",
        "n_heteroatom_more_than_2_conn",
        "n_carbon_atleast_2_heteroatoms",
        "n_atom_at_least_2_nei_more_than_2_conn",
        "abs_scaffold_format_charge",
        "n_bonds",
        "n_multiple_non_conj_ring_bonds",
        "n_bonds_2_heteroatoms",
        "n_carbon_het_carbon_het_bonds",
        "n_bonds_at_least_3_conn",
        "n_exocyclic_single_bonds_carbon",
        "n_exocyclic_single_bonds_nitrogen",
        "n_non_ring_bonds_2_conj_rings",
        "n_non_ring_bonds_conj_nonconj_rings",
        "n_bonds_atoms_with_at_least_one_nei_with_2_conn",
        "n_simple_rings",
        "size_largest_ring",
        "n_simple_rings_no_heteroatoms",
        "n_simple_rings_1_heteroatoms",
        "n_simple_rings_2_heteroatoms",
        "n_simple_rings_at_least_3_heteroatoms",
        "n_simple_non_conj_5_atoms_rings",
        "n_simple_non_conj_6_atoms_rings",
        "n_ring_system",
        "n_ring_system_with_2_non_conj_simple_ring",
        "n_ring_system_with_2_conj_simple_ring",
        "n_ring_system_with_conj_non_conj_simple_ring",
        "n_ring_system_with_3_conj_simple_ring",
        "n_ring_system_with_3_non_conj_simple_ring",
        "n_ring_system_with_greater_one_conj_nonconj_simple_ring",
    ]

    # Normalization table loaded once at class creation from the packaged CSV
    # and re-ordered to follow `DESCRIPTORS`; its "mean" and "std" columns are
    # used by `compute_normalization`.
    NORM_PARAMS = pd.read_csv(
        Path(molfeat.__file__).parents[0].joinpath("data/skey_parameters.csv"),
        index_col=0,
    ).loc[DESCRIPTORS]

    def __init__(
        self, normalize: bool = False, verbose: bool = False, use_scaffold: bool = False, **kwargs
    ):
        """
        Init of the scaffold key function

        Args:
            normalize: whether to normalize the value of the feature
            verbose: whether to log errors
            use_scaffold: whether to convert the molecule into scaffold first
            kwargs: accepted but not used by this initializer
        """
        self.normalize = normalize
        self.verbose = verbose
        self.use_scaffold = use_scaffold

    def __getstate__(self):
        """Get state of the scaffold key function"""
        state = {}
        state["normalize"] = self.normalize
        state["verbose"] = self.verbose
        state["use_scaffold"] = self.use_scaffold
        return state

    def __len__(self):
        """Number of descriptors computed by this calculator"""
        return len(self.DESCRIPTORS)

    @classmethod
    def compute_normalization(cls, features: np.ndarray):
        """Normalize input features. The normalization parameters are
        computed by the scaffolds of 2.1M molecules from CHEMBL 29.

        Args:
            features: descriptor values, in `DESCRIPTORS` order

        Returns:
            `(features - mean) / std` using the `NORM_PARAMS` columns.
        """
        # NOTE(review): the stored mean/std were presumably produced with an
        # earlier implementation of the descriptors; confirm whether they need
        # to be regenerated after the fixes to keys 9, 15 and 16.
        return (features - cls.NORM_PARAMS["mean"]) / cls.NORM_PARAMS["std"]

    def n_atom_in_rings(self, mol: rdchem.Mol):
        """1. number of ring atoms"""
        sm = dm.from_smarts("[r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_in_conjugated_ring(self, mol: rdchem.Mol):
        """2. number of atoms in conjugated rings"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if _is_ring_fully_conjugated(mol, ring):
                n += len(ring)
        return n

    def n_atoms_not_in_conjugated_ring(self, mol: rdchem.Mol):
        """
        3. number of atoms not in conjugated rings
        (i.e. atoms in aliphatic rings and non-ring atoms)
        """
        # EN: replace conjugation by aromatic
        # NOTE(review): only atoms of rings that are not fully conjugated are
        # counted here; non-ring atoms are not added. Confirm the intent.
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring):
                n += len(ring)
        return n

    def n_atom_in_chain(self, mol: rdchem.Mol):
        """4. number atoms in chains (not counting double-connected exo-chain atoms)"""
        sm = dm.from_smarts("[!r;!$(*=[r])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_exocyclic(self, mol: rdchem.Mol):
        """5. number of exocyclic atoms (connected by multiple bonds to a ring)"""
        sm = dm.from_smarts("[!r;!$(*-[r])&$(*~[r])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_nitrogen(self, mol: rdchem.Mol):
        """6. number of nitrogen"""
        sm = dm.from_smarts("[#7]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_nitrogen_in_ring(self, mol: rdchem.Mol):
        """7. number of nitrogen in rings"""
        sm = dm.from_smarts("[#7;r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_oxygen(self, mol: rdchem.Mol):
        """8. number of oxygen"""
        sm = dm.from_smarts("[#8]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_oxygen_in_ring(self, mol: rdchem.Mol):
        """9. number of oxygen in rings"""
        # The ring constraint mirrors `n_nitrogen_in_ring`; without it this key
        # would be identical to `n_oxygen`.
        sm = dm.from_smarts("[#8;r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_sulfur(self, mol: rdchem.Mol):
        """10. number of sulfur atoms"""
        sm = dm.from_smarts("[#16]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_heteroatoms(self, mol: rdchem.Mol):
        """11. number of heteroatoms"""

        sm = dm.from_smarts("[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_heteroatoms_in_ring(self, mol: rdchem.Mol):
        """12. number of heteroatoms in rings"""
        sm = dm.from_smarts("[!#1&!#6&r]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_atom_spiro_atoms(self, mol: rdchem.Mol):
        """13. number of spiro atoms"""
        return Desc.CalcNumSpiroAtoms(mol)

    def n_heteroatom_more_than_2_conn(self, mol: rdchem.Mol):
        """14. number of heteroatoms with more than 2 connections"""
        sm = dm.from_smarts("[!#1;!#6;!D1!D0;!D2]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_carbon_atleast_2_heteroatoms(self, mol: rdchem.Mol):
        """15. number of carbon atoms connected to at least 2 heteroatoms"""
        n_atoms = 0
        for atom in mol.GetAtoms():
            # Only carbon centers are counted by this key
            if atom.GetAtomicNum() != 6:
                continue
            hetero_neighbors = [x for x in atom.GetNeighbors() if x.GetAtomicNum() not in [1, 6]]
            n_atoms += len(hetero_neighbors) >= 2
        return n_atoms

    def n_atom_at_least_2_nei_more_than_2_conn(self, mol: rdchem.Mol):
        """16. Number of atoms where at least 2 connected atoms have more than 2 connections"""
        n_atoms = 0
        for atom in mol.GetAtoms():
            branched_neighbors = [x for x in atom.GetNeighbors() if len(x.GetNeighbors()) > 2]
            # "at least 2" neighbors, hence `>=`
            n_atoms += len(branched_neighbors) >= 2
        return n_atoms

    def abs_scaffold_format_charge(self, mol: rdchem.Mol):
        """17. absolute value of the scaffold formal charge"""
        charge = GetFormalCharge(mol)
        return abs(charge)

    def n_bonds(self, mol: rdchem.Mol):
        """18. number of bonds"""
        return mol.GetNumBonds()

    def n_multiple_non_conj_ring_bonds(self, mol: rdchem.Mol):
        """19. number of multiple, nonconjugated ring bonds"""
        extracted_rings = []
        nr_multiple_bonds_infcr = 0  # infcr: in not fully conjugated ring
        rings = Chem.GetSymmSSSR(mol)
        for i in range(len(rings)):
            extracted_rings.append(list(rings[i]))
        for ring in extracted_rings:
            if not _is_ring_fully_conjugated(mol, ring):
                nr_multiple_bonds_infcr += _n_multiple_bond_in_ring(mol, ring)
        return nr_multiple_bonds_infcr

    def n_bonds_2_heteroatoms(self, mol: rdchem.Mol):
        """20. number of bonds connecting 2 heteroatoms"""
        sm = dm.from_smarts("[!#1&!#6]~[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_carbon_het_carbon_het_bonds(self, mol: rdchem.Mol):
        """21. number of bonds connecting 2 heteroatoms through 2 carbons"""
        sm = dm.from_smarts("[!#1&!#6]~[#6]~[#6]~[!#1&!#6]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_bonds_at_least_3_conn(self, mol: rdchem.Mol):
        """22. number of bonds with at least 3 connections on both its atoms"""
        sm = dm.from_smarts("[$([!#1](~[!#1])(~[!#1])~[!#1])][$([!#1](~[!#1])(~[!#1])~[!#1])]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_exocyclic_single_bonds_carbon(self, mol: rdchem.Mol):
        """23. number of exocyclic single bonds where a ring atom is carbon"""
        sm = dm.from_smarts("[!R;!#1]-[#6;R]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_exocyclic_single_bonds_nitrogen(self, mol: rdchem.Mol):
        """24. number of exocyclic single bonds where a ring atom is nitrogen"""
        sm = dm.from_smarts("[!R;!#1]-[#7;R]")
        return len(mol.GetSubstructMatches(sm, uniquify=True))

    def n_non_ring_bonds_2_conj_rings(self, mol: rdchem.Mol):
        """25. number of non-ring bonds connecting 2 nonconjugated rings"""
        # EN: this is interpretated literally as bonds and not path
        # NOTE(review): the method name says "conj" while the docstring and the
        # test below (False on both ends) say non-conjugated; confirm which one
        # is intended.
        ring_atom_conj_state = _ring_atom_state(mol)
        sm = dm.from_smarts("[R:1]!@[R:2]")
        bond_list = mol.GetSubstructMatches(sm, uniquify=True)
        result = 0
        for a_start, a_end in bond_list:
            s_state = ring_atom_conj_state.get(a_start)
            e_state = ring_atom_conj_state.get(a_end)
            if False in s_state and False in e_state:
                result += 1
        return result

    def n_non_ring_bonds_conj_nonconj_rings(self, mol: rdchem.Mol):
        """
        26. number of non-ring bonds connecting 2 rings,
        one of them conjugated and one non-conjugated
        """
        # EN: this is interpretated literally as bonds and not path

        ring_atom_conj_state = _ring_atom_state(mol)
        sm = dm.from_smarts("[R:1]!@[R:2]")
        bond_list = mol.GetSubstructMatches(sm, uniquify=True)
        result = 0
        for a_start, a_end in bond_list:
            s_state = ring_atom_conj_state.get(a_start)
            e_state = ring_atom_conj_state.get(a_end)
            if (True in s_state and False in e_state) or (False in s_state and True in e_state):
                result += 1
        return result

    def n_bonds_atoms_with_at_least_one_nei_with_2_conn(self, mol: rdchem.Mol):
        """
        27. number of bonds where both atoms have at least one neighbor
        (not considering the bond atoms) with more than 2 connections
        """
        result = 0
        # Indices of the atoms whose degree is neither 0, 1 nor 2
        huge_conn = list(
            itertools.chain(*mol.GetSubstructMatches(dm.from_smarts("[*;!D0;!D1;!D2]"), uniquify=1))
        )
        for bond in mol.GetBonds():
            a_start, a_end = bond.GetBeginAtom(), bond.GetEndAtom()
            # we need to exclude the bond atoms themselves
            allowed_conn_table = [
                x for x in huge_conn if x not in [a_start.GetIdx(), a_end.GetIdx()]
            ]
            if any([x.GetIdx() in allowed_conn_table for x in a_start.GetNeighbors()]) and any(
                [y.GetIdx() in allowed_conn_table for y in a_end.GetNeighbors()]
            ):
                result += 1
        return result

    def n_simple_rings(self, mol: rdchem.Mol):
        """28. number of simple rings"""
        ri = mol.GetRingInfo()
        return ri.NumRings()

    def size_largest_ring(self, mol: rdchem.Mol):
        """29. Size of the largest ring"""
        ri = mol.GetRingInfo()
        # `default=0` covers molecules without any ring
        max_ring_size = max((len(r) for r in ri.AtomRings()), default=0)
        return max_ring_size

    def n_simple_rings_no_heteroatoms(self, mol: rdchem.Mol):
        """30. number of simple rings with no heteroatoms"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 0)

    def n_simple_rings_1_heteroatoms(self, mol: rdchem.Mol):
        """31. number of simple rings with 1 heteroatom"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 1)

    def n_simple_rings_2_heteroatoms(self, mol: rdchem.Mol):
        """32. number of simple rings with 2 heteroatom"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x == 2)

    def n_simple_rings_at_least_3_heteroatoms(self, mol: rdchem.Mol):
        """33. number of simple rings with 3 or more heteroatoms"""
        ri = mol.GetRingInfo()
        n_heteros = _count_heteroatom_per_ring(mol, ri.AtomRings())
        return sum(1 for x in n_heteros if x >= 3)

    def n_simple_non_conj_5_atoms_rings(self, mol: rdchem.Mol):
        """34. number of simple non-conjugated rings with 5 atoms"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 5:
                n += 1
        return n

    def n_simple_non_conj_6_atoms_rings(self, mol: rdchem.Mol):
        """35. number of simple non-conjugated rings with 6 atoms"""
        ri = mol.GetRingInfo()
        n = 0
        for ring in ri.AtomRings():
            if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 6:
                n += 1
        return n

    def n_ring_system(self, mol: rdchem.Mol):
        """36. number of ring systems"""
        simple_rings, ring_system, _ = _get_ring_system(mol)
        return len(ring_system)

    def n_ring_system_with_2_non_conj_simple_ring(self, mol: rdchem.Mol):
        """37. number of rings systems with 2 non-conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
            result += n_not_conj == 2
        return result

    def n_ring_system_with_2_conj_simple_ring(self, mol: rdchem.Mol):
        """38. number of rings systems with 2 conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj == 2
        return result

    def n_ring_system_with_conj_non_conj_simple_ring(self, mol: rdchem.Mol):
        """39 number of ring system containing 2 simple rings, one conjugated and one nonconjugated"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            if len(ring_set) == 2:
                n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
                result += n_conj == 1
        return result

    def n_ring_system_with_3_conj_simple_ring(self, mol: rdchem.Mol):
        """40. number of rings systems with 3 conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
            result += n_conj == 3
        return result

    def n_ring_system_with_3_non_conj_simple_ring(self, mol: rdchem.Mol):
        """41. number of rings systems with 3 non-conjugated simple rings"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            n_not_conj = sum(not conj_rings_map[rnum] for rnum in ring_set)
            result += n_not_conj == 3
        return result

    def n_ring_system_with_greater_one_conj_nonconj_simple_ring(self, mol: rdchem.Mol):
        """42. number of ring system containing 3 simple rings, at least one conjugated and one nonconjugated"""
        simple_rings, _, ring_map = _get_ring_system(mol)
        conj_rings_map = dict(
            (i, _is_ring_fully_conjugated(mol, x)) for i, x in enumerate(simple_rings)
        )
        result = 0
        for ring_set in ring_map:
            if len(ring_set) == 3:
                n_conj = sum(conj_rings_map[rnum] for rnum in ring_set)
                # 1 or 2 conjugated rings out of 3 means both kinds are present
                result += n_conj in [1, 2]
        return result

    @property
    def columns(self):
        """Get the name of all the descriptors of this calculator"""
        return list(self.DESCRIPTORS)

    def __call__(self, mol: Union[rdchem.Mol, str]):
        r"""
        Compute the scaffold keys of a molecule

        Args:
            mol: the molecule of interest

        Returns:
            props (np.ndarray): the descriptor values in `DESCRIPTORS` order,
                normalized with `compute_normalization` when `normalize` is set.
                A descriptor that raises is reported as NaN.
        """
        mol = dm.to_mol(mol)
        if self.use_scaffold and mol is not None:
            mol = MurckoScaffold.GetScaffoldForMol(mol)

        props = []
        for k in self.DESCRIPTORS:
            # Best effort: a failing descriptor must not prevent the others
            # from being computed, so any error becomes a NaN entry.
            try:
                fn = getattr(self, k)
                props.append(fn(mol))
            except Exception as e:
                if self.verbose:
                    logger.error(e)
                props.append(float("nan"))
        props = np.asarray(props)
        if self.normalize:
            return self.compute_normalization(props)
        return props

DESCRIPTORS = ['n_atom_in_rings', 'n_atom_in_conjugated_ring', 'n_atoms_not_in_conjugated_ring', 'n_atom_in_chain', 'n_atom_exocyclic', 'n_nitrogen', 'n_nitrogen_in_ring', 'n_oxygen', 'n_oxygen_in_ring', 'n_sulfur', 'n_heteroatoms', 'n_heteroatoms_in_ring', 'n_atom_spiro_atoms', 'n_heteroatom_more_than_2_conn', 'n_carbon_atleast_2_heteroatoms', 'n_atom_at_least_2_nei_more_than_2_conn', 'abs_scaffold_format_charge', 'n_bonds', 'n_multiple_non_conj_ring_bonds', 'n_bonds_2_heteroatoms', 'n_carbon_het_carbon_het_bonds', 'n_bonds_at_least_3_conn', 'n_exocyclic_single_bonds_carbon', 'n_exocyclic_single_bonds_nitrogen', 'n_non_ring_bonds_2_conj_rings', 'n_non_ring_bonds_conj_nonconj_rings', 'n_bonds_atoms_with_at_least_one_nei_with_2_conn', 'n_simple_rings', 'size_largest_ring', 'n_simple_rings_no_heteroatoms', 'n_simple_rings_1_heteroatoms', 'n_simple_rings_2_heteroatoms', 'n_simple_rings_at_least_3_heteroatoms', 'n_simple_non_conj_5_atoms_rings', 'n_simple_non_conj_6_atoms_rings', 'n_ring_system', 'n_ring_system_with_2_non_conj_simple_ring', 'n_ring_system_with_2_conj_simple_ring', 'n_ring_system_with_conj_non_conj_simple_ring', 'n_ring_system_with_3_conj_simple_ring', 'n_ring_system_with_3_non_conj_simple_ring', 'n_ring_system_with_greater_one_conj_nonconj_simple_ring'] `class-attribute` ¶

`NORM_PARAMS = pd.read_csv(Path(molfeat.file).parents[0].joinpath('data/skey_parameters.csv'), index_col=0).loc[DESCRIPTORS]` `class-attribute` ¶

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`normalize = normalize` `instance-attribute` ¶

`use_scaffold = use_scaffold` `instance-attribute` ¶

`verbose = verbose` `instance-attribute` ¶

`call(mol)` ¶

Compute the scaffold-key descriptor vector of a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required

Returns:

Name	Type	Description
`props`	`np.ndarray`	list of computed rdkit molecular descriptors

Source code in molfeat/calc/skeys.py

def __call__(self, mol: Union[rdchem.Mol, str]):
    r"""
    Compute the scaffold-key feature vector of a molecule

    Every name listed in ``self.DESCRIPTORS`` is resolved to a method of this
    calculator and evaluated on the molecule. A descriptor that raises is
    recorded as NaN (and logged when ``self.verbose`` is set) so one failure
    does not abort the whole vector.

    Args:
        mol: the molecule of interest

    Returns:
        props (np.ndarray): one value per entry of ``DESCRIPTORS``; passed through
            ``compute_normalization`` first when ``self.normalize`` is set
    """
    mol = dm.to_mol(mol)
    if mol is not None and self.use_scaffold:
        mol = MurckoScaffold.GetScaffoldForMol(mol)

    values = []
    for name in self.DESCRIPTORS:
        try:
            value = getattr(self, name)(mol)
        except Exception as err:
            # best-effort: keep the slot but mark it as missing
            if self.verbose:
                logger.error(err)
            value = float("nan")
        values.append(value)

    features = np.asarray(values)
    return self.compute_normalization(features) if self.normalize else features

`__getstate__()` ¶

Get state of the scaffold key function

Source code in molfeat/calc/skeys.py

def __getstate__(self):
    """Get state of the scaffold key function

    Returns:
        state (dict): the ``normalize``, ``verbose`` and ``use_scaffold`` settings
    """
    return {
        "normalize": self.normalize,
        "verbose": self.verbose,
        "use_scaffold": self.use_scaffold,
    }

`__init__(normalize=False, verbose=False, use_scaffold=False, **kwargs)` ¶

Init of the scaffold key function

Parameters:

Name	Type	Description	Default
`normalize`	`bool`	whether to normalize the value of the feature	`False`
`verbose`	`bool`	whether to log errors	`False`
`use_scaffold`	`bool`	whether to convert the molecule into scaffold first	`False`

Source code in molfeat/calc/skeys.py

def __init__(
    self, normalize: bool = False, verbose: bool = False, use_scaffold: bool = False, **kwargs
):
    """
    Init of the scaffold key function

    Args:
        normalize: whether to normalize the value of the feature
        verbose: whether to log errors
        use_scaffold: whether to convert the molecule into scaffold first
        kwargs: extra keyword arguments; accepted but not used by this constructor
    """
    self.use_scaffold = use_scaffold
    self.verbose = verbose
    self.normalize = normalize

`__len__()` ¶

Source code in molfeat/calc/skeys.py

def __len__(self):
    """Return the number of descriptors listed in ``DESCRIPTORS``"""
    descriptor_names = self.DESCRIPTORS
    return len(descriptor_names)

`abs_scaffold_format_charge(mol)` ¶

absolute value of the scaffold formal charge

Source code in molfeat/calc/skeys.py

def abs_scaffold_format_charge(self, mol: rdchem.Mol):
    """17. absolute value of the scaffold formal charge"""
    return abs(GetFormalCharge(mol))

`compute_normalization(features)` `classmethod` ¶

Normalize input features. The normalization parameters are computed by the scaffolds of 2.1M molecules from CHEMBL 29.

Source code in molfeat/calc/skeys.py

@classmethod
def compute_normalization(cls, features: np.ndarray):
    """Normalize input features.

    Subtracts the ``mean`` column of ``NORM_PARAMS`` and divides by its ``std``
    column. The normalization parameters are computed by the scaffolds of
    2.1M molecules from CHEMBL 29.
    """
    params = cls.NORM_PARAMS
    centered = features - params["mean"]
    return centered / params["std"]

`n_atom_at_least_2_nei_more_than_2_conn(mol)` ¶

Number of atoms where at least 2 connected atoms have more than 2 connections

Source code in molfeat/calc/skeys.py

def n_atom_at_least_2_nei_more_than_2_conn(self, mol: rdchem.Mol):
    """16. Number of atoms where at least 2 connected atoms have more than 2 connections

    Args:
        mol: molecule whose atoms are inspected

    Returns:
        n_atoms (int): count of atoms having two or more neighbors that
            themselves have more than 2 neighbors
    """
    n_atoms = 0
    for atom in mol.GetAtoms():
        branched_nei = [x for x in atom.GetNeighbors() if len(x.GetNeighbors()) > 2]
        # "at least 2" -> >= 2 (the previous `> 2` required three such neighbors)
        # NOTE(review): shipped normalization parameters may have been fitted with
        # the old threshold - confirm before relying on normalized values.
        n_atoms += len(branched_nei) >= 2
    return n_atoms

`n_atom_exocyclic(mol)` ¶

number of exocyclic atoms (connected by multiple bonds to a ring)

Source code in molfeat/calc/skeys.py

def n_atom_exocyclic(self, mol: rdchem.Mol):
    """5. number of exocyclic atoms (connected by multiple bonds to a ring)"""
    pattern = dm.from_smarts("[!r;!$(*-[r])&$(*~[r])]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_atom_in_chain(mol)` ¶

number of atoms in chains (not counting double-connected exo-chain atoms)

Source code in molfeat/calc/skeys.py

def n_atom_in_chain(self, mol: rdchem.Mol):
    """4. number of atoms in chains (not counting double-connected exo-chain atoms)"""
    pattern = dm.from_smarts("[!r;!$(*=[r])]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_atom_in_conjugated_ring(mol)` ¶

number of atoms in conjugated rings

Source code in molfeat/calc/skeys.py

def n_atom_in_conjugated_ring(self, mol: rdchem.Mol):
    """2. number of atoms in conjugated rings

    Sums the sizes of all rings reported as fully conjugated; an atom shared
    by several such rings contributes once per ring.
    """
    rings = mol.GetRingInfo().AtomRings()
    return sum(len(ring) for ring in rings if _is_ring_fully_conjugated(mol, ring))

`n_atom_in_rings(mol)` ¶

number of ring atoms

Source code in molfeat/calc/skeys.py

def n_atom_in_rings(self, mol: rdchem.Mol):
    """1. number of ring atoms"""
    pattern = dm.from_smarts("[r]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_atom_spiro_atoms(mol)` ¶

number of spiro atoms

Source code in molfeat/calc/skeys.py

def n_atom_spiro_atoms(self, mol: rdchem.Mol):
    """13. number of spiro atoms"""
    n_spiro = Desc.CalcNumSpiroAtoms(mol)
    return n_spiro

`n_atoms_not_in_conjugated_ring(mol)` ¶

number of atoms not in conjugated rings (i.e. atoms in aliphatic rings and non-ring atoms)

Source code in molfeat/calc/skeys.py

def n_atoms_not_in_conjugated_ring(self, mol: rdchem.Mol):
    """
    3. number of atoms not in conjugated rings

    As implemented, this sums the sizes of the rings that are not fully
    conjugated; atoms outside any ring are not added by this method.
    """
    # EN: replace conjugation by aromatic
    rings = mol.GetRingInfo().AtomRings()
    return sum(len(ring) for ring in rings if not _is_ring_fully_conjugated(mol, ring))

`n_bonds(mol)` ¶

number of bonds

Source code in molfeat/calc/skeys.py

def n_bonds(self, mol: rdchem.Mol):
    """18. number of bonds"""
    bond_count = mol.GetNumBonds()
    return bond_count

`n_bonds_2_heteroatoms(mol)` ¶

number of bonds connecting 2 heteroatoms

Source code in molfeat/calc/skeys.py

def n_bonds_2_heteroatoms(self, mol: rdchem.Mol):
    """20. number of bonds connecting 2 heteroatoms"""
    pattern = dm.from_smarts("[!#1&!#6]~[!#1&!#6]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_bonds_at_least_3_conn(mol)` ¶

number of bonds with at least 3 connections on both its atoms

Source code in molfeat/calc/skeys.py

def n_bonds_at_least_3_conn(self, mol: rdchem.Mol):
    """22. number of bonds with at least 3 connections on both its atoms"""
    pattern = dm.from_smarts(
        "[$([!#1](~[!#1])(~[!#1])~[!#1])][$([!#1](~[!#1])(~[!#1])~[!#1])]"
    )
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_bonds_atoms_with_at_least_one_nei_with_2_conn(mol)` ¶

number of bonds where both atoms have at least one neighbor (not considering the bond atoms) with more than 2 connections

Source code in molfeat/calc/skeys.py

def n_bonds_atoms_with_at_least_one_nei_with_2_conn(self, mol: rdchem.Mol):
    """
    27. number of bonds where both atoms have at least one neighbor
    (not considering the bond atoms) with more than 2 connections
    """
    # indices of every atom whose degree is neither 0, 1 nor 2
    hub_matches = mol.GetSubstructMatches(dm.from_smarts("[*;!D0;!D1;!D2]"), uniquify=1)
    hub_indices = list(itertools.chain(*hub_matches))

    count = 0
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtom(), bond.GetEndAtom()
        endpoints = [begin.GetIdx(), end.GetIdx()]
        # the two atoms of the bond may not serve as each other's hub
        eligible = [idx for idx in hub_indices if idx not in endpoints]
        if not any([nei.GetIdx() in eligible for nei in begin.GetNeighbors()]):
            continue
        if any([nei.GetIdx() in eligible for nei in end.GetNeighbors()]):
            count += 1
    return count

`n_carbon_atleast_2_heteroatoms(mol)` ¶

number of carbon atoms connected to at least 2 heteroatoms

Source code in molfeat/calc/skeys.py

def n_carbon_atleast_2_heteroatoms(self, mol: rdchem.Mol):
    """15. number of carbon atoms connected to at least 2 heteroatoms

    Args:
        mol: molecule whose atoms are inspected

    Returns:
        n_atoms (int): count of carbon atoms with two or more neighbors whose
            atomic number is neither 1 (hydrogen) nor 6 (carbon)
    """
    n_atoms = 0
    for atom in mol.GetAtoms():
        # Only carbon centres qualify; previously any atom (e.g. a sulfonyl
        # sulfur) with two heteroatom neighbors was counted as well.
        # NOTE(review): shipped normalization parameters may have been fitted with
        # the old behavior - confirm before relying on normalized values.
        if atom.GetAtomicNum() != 6:
            continue
        hetero_nei = [x for x in atom.GetNeighbors() if x.GetAtomicNum() not in [1, 6]]
        n_atoms += len(hetero_nei) >= 2
    return n_atoms

`n_carbon_het_carbon_het_bonds(mol)` ¶

number of bonds connecting 2 heteroatoms through 2 carbons

Source code in molfeat/calc/skeys.py

def n_carbon_het_carbon_het_bonds(self, mol: rdchem.Mol):
    """21. number of bonds connecting 2 heteroatoms through 2 carbons"""
    pattern = dm.from_smarts("[!#1&!#6]~[#6]~[#6]~[!#1&!#6]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_exocyclic_single_bonds_carbon(mol)` ¶

number of exocyclic single bonds where a ring atom is carbon

Source code in molfeat/calc/skeys.py

def n_exocyclic_single_bonds_carbon(self, mol: rdchem.Mol):
    """23. number of exocyclic single bonds where a ring atom is carbon"""
    pattern = dm.from_smarts("[!R;!#1]-[#6;R]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_exocyclic_single_bonds_nitrogen(mol)` ¶

number of exocyclic single bonds where a ring atom is nitrogen

Source code in molfeat/calc/skeys.py

def n_exocyclic_single_bonds_nitrogen(self, mol: rdchem.Mol):
    """24. number of exocyclic single bonds where a ring atom is nitrogen"""
    pattern = dm.from_smarts("[!R;!#1]-[#7;R]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_heteroatom_more_than_2_conn(mol)` ¶

number of heteroatoms with more than 2 connections

Source code in molfeat/calc/skeys.py

def n_heteroatom_more_than_2_conn(self, mol: rdchem.Mol):
    """14. number of heteroatoms with more than 2 connections"""
    pattern = dm.from_smarts("[!#1;!#6;!D1!D0;!D2]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_heteroatoms(mol)` ¶

number of heteroatoms

Source code in molfeat/calc/skeys.py

def n_heteroatoms(self, mol: rdchem.Mol):
    """11. number of heteroatoms"""
    pattern = dm.from_smarts("[!#1&!#6]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_heteroatoms_in_ring(mol)` ¶

number of heteroatoms in rings

Source code in molfeat/calc/skeys.py

def n_heteroatoms_in_ring(self, mol: rdchem.Mol):
    """12. number of heteroatoms in rings"""
    pattern = dm.from_smarts("[!#1&!#6&r]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_multiple_non_conj_ring_bonds(mol)` ¶

number of multiple, nonconjugated ring bonds

Source code in molfeat/calc/skeys.py

def n_multiple_non_conj_ring_bonds(self, mol: rdchem.Mol):
    """19. number of multiple, nonconjugated ring bonds

    Accumulates, over every SSSR ring that is not fully conjugated, the
    value returned by ``_n_multiple_bond_in_ring`` for that ring.
    """
    sssr = Chem.GetSymmSSSR(mol)
    ring_atoms = [list(sssr[pos]) for pos in range(len(sssr))]
    total = 0
    for ring in ring_atoms:
        if _is_ring_fully_conjugated(mol, ring):
            continue
        total += _n_multiple_bond_in_ring(mol, ring)
    return total

`n_nitrogen(mol)` ¶

number of nitrogen

Source code in molfeat/calc/skeys.py

def n_nitrogen(self, mol: rdchem.Mol):
    """6. number of nitrogen"""
    pattern = dm.from_smarts("[#7]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_nitrogen_in_ring(mol)` ¶

number of nitrogen in rings

Source code in molfeat/calc/skeys.py

def n_nitrogen_in_ring(self, mol: rdchem.Mol):
    """7. number of nitrogen in rings"""
    pattern = dm.from_smarts("[#7;r]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_non_ring_bonds_2_conj_rings(mol)` ¶

number of non-ring bonds connecting 2 nonconjugated rings

Source code in molfeat/calc/skeys.py

def n_non_ring_bonds_2_conj_rings(self, mol: rdchem.Mol):
    """25. number of non-ring bonds connecting 2 nonconjugated rings"""
    # EN: this is interpretated literally as bonds and not path
    # presumably maps a ring atom index to the conjugation flags of its rings;
    # verify against `_ring_atom_state`
    conj_state = _ring_atom_state(mol)
    linker = dm.from_smarts("[R:1]!@[R:2]")
    count = 0
    for begin_idx, end_idx in mol.GetSubstructMatches(linker, uniquify=True):
        begin_flags = conj_state.get(begin_idx)
        end_flags = conj_state.get(end_idx)
        if False in begin_flags and False in end_flags:
            count += 1
    return count

`n_non_ring_bonds_conj_nonconj_rings(mol)` ¶

number of non-ring bonds connecting 2 rings, one of them conjugated and one non-conjugated

Source code in molfeat/calc/skeys.py

def n_non_ring_bonds_conj_nonconj_rings(self, mol: rdchem.Mol):
    """
    26. number of non-ring bonds connecting 2 rings,
    one of them conjugated and one non-conjugated
    """
    # EN: this is interpretated literally as bonds and not path
    # presumably maps a ring atom index to the conjugation flags of its rings;
    # verify against `_ring_atom_state`
    conj_state = _ring_atom_state(mol)
    linker = dm.from_smarts("[R:1]!@[R:2]")
    count = 0
    for begin_idx, end_idx in mol.GetSubstructMatches(linker, uniquify=True):
        begin_flags = conj_state.get(begin_idx)
        end_flags = conj_state.get(end_idx)
        if True in begin_flags and False in end_flags:
            count += 1
        elif False in begin_flags and True in end_flags:
            count += 1
    return count

`n_oxygen(mol)` ¶

number of oxygen

Source code in molfeat/calc/skeys.py

def n_oxygen(self, mol: rdchem.Mol):
    """8. number of oxygen"""
    pattern = dm.from_smarts("[#8]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`n_oxygen_in_ring(mol)` ¶

number of oxygen in rings

Source code in molfeat/calc/skeys.py

def n_oxygen_in_ring(self, mol: rdchem.Mol):
    """9. number of oxygen in rings

    Args:
        mol: molecule to search

    Returns:
        count (int): number of oxygen atoms that are ring members
    """
    # `;r` restricts the match to ring atoms; the bare "[#8]" pattern counted
    # every oxygen and made this key identical to `n_oxygen`.
    sm = dm.from_smarts("[#8;r]")
    return len(mol.GetSubstructMatches(sm, uniquify=True))

`n_ring_system(mol)` ¶

number of ring systems

Source code in molfeat/calc/skeys.py

def n_ring_system(self, mol: rdchem.Mol):
    """36. number of ring systems"""
    _, systems, _ = _get_ring_system(mol)
    return len(systems)

`n_ring_system_with_2_conj_simple_ring(mol)` ¶

number of ring systems with 2 conjugated simple rings

Source code in molfeat/calc/skeys.py

def n_ring_system_with_2_conj_simple_ring(self, mol: rdchem.Mol):
    """38. number of ring systems with 2 conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        n_conj = sum(is_conj[pos] for pos in members)
        if n_conj == 2:
            count += 1
    return count

`n_ring_system_with_2_non_conj_simple_ring(mol)` ¶

number of ring systems with 2 non-conjugated simple rings

Source code in molfeat/calc/skeys.py

def n_ring_system_with_2_non_conj_simple_ring(self, mol: rdchem.Mol):
    """37. number of ring systems with 2 non-conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        n_not_conj = sum(not is_conj[pos] for pos in members)
        if n_not_conj == 2:
            count += 1
    return count

`n_ring_system_with_3_conj_simple_ring(mol)` ¶

number of ring systems with 3 conjugated simple rings

Source code in molfeat/calc/skeys.py

def n_ring_system_with_3_conj_simple_ring(self, mol: rdchem.Mol):
    """40. number of ring systems with 3 conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        n_conj = sum(is_conj[pos] for pos in members)
        if n_conj == 3:
            count += 1
    return count

`n_ring_system_with_3_non_conj_simple_ring(mol)` ¶

number of ring systems with 3 non-conjugated simple rings

Source code in molfeat/calc/skeys.py

def n_ring_system_with_3_non_conj_simple_ring(self, mol: rdchem.Mol):
    """41. number of ring systems with 3 non-conjugated simple rings"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        n_not_conj = sum(not is_conj[pos] for pos in members)
        if n_not_conj == 3:
            count += 1
    return count

`n_ring_system_with_conj_non_conj_simple_ring(mol)` ¶

number of ring systems containing 2 simple rings, one conjugated and one nonconjugated

Source code in molfeat/calc/skeys.py

def n_ring_system_with_conj_non_conj_simple_ring(self, mol: rdchem.Mol):
    """39. number of ring systems containing 2 simple rings, one conjugated and one nonconjugated"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        if len(members) != 2:
            continue
        n_conj = sum(is_conj[pos] for pos in members)
        if n_conj == 1:
            count += 1
    return count

`n_ring_system_with_greater_one_conj_nonconj_simple_ring(mol)` ¶

number of ring system containing 3 simple rings, at least one conjugated and one nonconjugated

Source code in molfeat/calc/skeys.py

def n_ring_system_with_greater_one_conj_nonconj_simple_ring(self, mol: rdchem.Mol):
    """42. number of ring systems containing 3 simple rings, at least one conjugated and one nonconjugated"""
    simple_rings, _, ring_map = _get_ring_system(mol)
    # conjugation flag of each simple ring, keyed by its position
    is_conj = {pos: _is_ring_fully_conjugated(mol, ring) for pos, ring in enumerate(simple_rings)}
    count = 0
    for members in ring_map:
        if len(members) != 3:
            continue
        n_conj = sum(is_conj[pos] for pos in members)
        # 1 or 2 conjugated rings out of 3 means both kinds are present
        if n_conj in [1, 2]:
            count += 1
    return count

`n_simple_non_conj_5_atoms_rings(mol)` ¶

number of simple non-conjugated rings with 5 atoms

Source code in molfeat/calc/skeys.py

def n_simple_non_conj_5_atoms_rings(self, mol: rdchem.Mol):
    """34. number of simple non-conjugated rings with 5 atoms"""
    rings = mol.GetRingInfo().AtomRings()
    return sum(
        1 for ring in rings if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 5
    )

`n_simple_non_conj_6_atoms_rings(mol)` ¶

number of simple non-conjugated rings with 6 atoms

Source code in molfeat/calc/skeys.py

def n_simple_non_conj_6_atoms_rings(self, mol: rdchem.Mol):
    """35. number of simple non-conjugated rings with 6 atoms"""
    rings = mol.GetRingInfo().AtomRings()
    return sum(
        1 for ring in rings if not _is_ring_fully_conjugated(mol, ring) and len(ring) == 6
    )

`n_simple_rings(mol)` ¶

number of simple rings

Source code in molfeat/calc/skeys.py

def n_simple_rings(self, mol: rdchem.Mol):
    """28. number of simple rings"""
    return mol.GetRingInfo().NumRings()

`n_simple_rings_1_heteroatoms(mol)` ¶

number of simple rings with 1 heteroatom

Source code in molfeat/calc/skeys.py

def n_simple_rings_1_heteroatoms(self, mol: rdchem.Mol):
    """31. number of simple rings with 1 heteroatom"""
    rings = mol.GetRingInfo().AtomRings()
    per_ring = _count_heteroatom_per_ring(mol, rings)
    return len([n_het for n_het in per_ring if n_het == 1])

`n_simple_rings_2_heteroatoms(mol)` ¶

number of simple rings with 2 heteroatoms

Source code in molfeat/calc/skeys.py

def n_simple_rings_2_heteroatoms(self, mol: rdchem.Mol):
    """32. number of simple rings with 2 heteroatoms"""
    rings = mol.GetRingInfo().AtomRings()
    per_ring = _count_heteroatom_per_ring(mol, rings)
    return len([n_het for n_het in per_ring if n_het == 2])

`n_simple_rings_at_least_3_heteroatoms(mol)` ¶

number of simple rings with 3 or more heteroatoms

Source code in molfeat/calc/skeys.py

def n_simple_rings_at_least_3_heteroatoms(self, mol: rdchem.Mol):
    """33. number of simple rings with 3 or more heteroatoms"""
    rings = mol.GetRingInfo().AtomRings()
    per_ring = _count_heteroatom_per_ring(mol, rings)
    return len([n_het for n_het in per_ring if n_het >= 3])

`n_simple_rings_no_heteroatoms(mol)` ¶

number of simple rings with no heteroatoms

Source code in molfeat/calc/skeys.py

def n_simple_rings_no_heteroatoms(self, mol: rdchem.Mol):
    """30. number of simple rings with no heteroatoms"""
    rings = mol.GetRingInfo().AtomRings()
    per_ring = _count_heteroatom_per_ring(mol, rings)
    return len([n_het for n_het in per_ring if n_het == 0])

`n_sulfur(mol)` ¶

number of sulfur atoms

Source code in molfeat/calc/skeys.py

def n_sulfur(self, mol: rdchem.Mol):
    """10. number of sulfur atoms"""
    pattern = dm.from_smarts("[#16]")
    matches = mol.GetSubstructMatches(pattern, uniquify=True)
    return len(matches)

`size_largest_ring(mol)` ¶

Size of the largest ring

Source code in molfeat/calc/skeys.py

def size_largest_ring(self, mol: rdchem.Mol):
    """29. Size of the largest ring (0 when the molecule has no ring)"""
    rings = mol.GetRingInfo().AtomRings()
    return max(map(len, rings), default=0)

`skdistance(sk1, sk2, weights=None, cdist=False)` ¶

Compute the scaffold distance between two scaffold keys as described in https://pubs.acs.org/doi/abs/10.1021/ci5001983. The input features are expected to be normalized beforehand (see paper)

Parameters:

Name	Type	Description	Default
`sk1`	`np.ndarray`	scaffold key 1	required
`sk2`	`np.ndarray`	scaffold key 2	required
`weights`	`Optional[np.ndarray]`	how to weight each of the features. By default rank ordering is used.	`None`
`cdist`	`bool`	whether to compute the distances on a batch of inputs (expected 2D)	`False`

Returns:

Name	Type	Description
`dist`	`float`	distance between two scaffold keys

Source code in molfeat/calc/skeys.py

def skdistance(
    sk1: np.ndarray,
    sk2: np.ndarray,
    weights: Optional[np.ndarray] = None,
    cdist: bool = False,
):
    """Compute the scaffold distance between two scaffold keys
    as described in https://pubs.acs.org/doi/abs/10.1021/ci5001983.
    The input features are expected to be normalized beforehand (see paper)

    The distance is the weighted sum of ``|sk1 - sk2| ** 1.5`` over the feature axis.

    Args:
        sk1: scaffold key 1
        sk2: scaffold key 2
        weights: how to weight each of the features. By default rank ordering is used,
            i.e. feature ``i`` (0-based) gets weight ``1 / (i + 1)``.
        cdist: whether to compute the distances on a batch of inputs (expected 2D);
            the result then holds one distance per (row of sk1, row of sk2) pair

    Raises:
        ValueError: when ``cdist`` is False and an input has more than one row

    Returns:
        dist (float): distance between two scaffold keys
    """
    if weights is None:
        weights = 1 / (np.arange(sk1.shape[-1]) + 1)

    if not cdist:
        for key in (sk1, sk2):
            if key.ndim > 1 and key.shape[0] != 1:
                raise ValueError(
                    "`cdist` mode was not detected, you need to provide single vectors"
                )
        return np.sum(np.abs(sk1 - sk2) ** 1.5 * weights)

    lhs = np.atleast_2d(sk1)
    rhs = np.atleast_2d(sk2)
    # broadcast to (n_lhs, n_rhs, n_features) before reducing the feature axis
    pairwise = np.abs(lhs[:, None] - rhs[:]) ** 1.5
    return np.sum(pairwise * weights, axis=-1)

`Shape`¶

`ElectroShapeDescriptors` ¶

Bases: SerializableCalculator

Compute Electroshape descriptors as described by

Armstrong et al. ElectroShape: fast molecular similarity calculations incorporating shape, chirality and electrostatics. J Comput Aided Mol Des 24, 789-801 (2010). http://dx.doi.org/doi:10.1007/s10822-010-9374-0

Source code in molfeat/calc/shape.py

class ElectroShapeDescriptors(SerializableCalculator):
    """Compute Electroshape descriptors as described by

    Armstrong et al. ElectroShape: fast molecular similarity calculations incorporating shape, chirality and electrostatics.
    J Comput Aided Mol Des 24, 789-801 (2010). http://dx.doi.org/doi:10.1007/s10822-010-9374-0
    """

    SUPPORTED_CHARGE_MODELS = ["gasteiger", "tripos", "mmff94", "formal"]

    def __init__(
        self,
        charge_model: str = "gasteiger",
        replace_nan: bool = False,
        electron_scaling: float = 25.0,
        **kwargs,
    ):
        """Constructor for ElectroShape descriptor

        Args:
            charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
                Note that formal charges will be computed on the fly if not provided in the input molecules.
                The `tripos` charge models comes from TRIPOS force field and are often parsed from mol2 files.
            replace_nan: whether to replace NaN values. Defaults False
            electron_scaling: scaling factor to convert electron charges to Angstroms. Defaults to 25.0.
            kwargs: extra keyword arguments; accepted but not used by this constructor
        """
        # column names are built lazily by the `columns` property
        self._columns = None
        self.electron_scaling = electron_scaling
        self.replace_nan = replace_nan
        self.charge_model = charge_model

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        (mean, std and crb for each of the 5 reference points)
        """
        if self._columns is None:
            names = []
            for idx in range(1, 6):
                names += [f"dist-{idx}-mean", f"dist-{idx}-std", f"dist-{idx}-crb"]
            self._columns = names
        return self._columns

    def __getstate__(self):
        """Get the serializable state of the calculator"""
        return {
            "charge_model": self.charge_model,
            "replace_nan": self.replace_nan,
            "electron_scaling": self.electron_scaling,
            "_columns": self._columns,
        }

    def __len__(self):
        """Return the length of the calculator"""
        column_names = self.columns
        return len(column_names)

    @staticmethod
    def compute_charge(mol: Union[dm.Mol, str], charge_model: str = None):
        """
        Get the per-atom charges of the molecule.

        Args:
            mol: the molecule of interest
            charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal').
                The default ``None`` is not a supported model and raises.

        Raises:
            ValueError: when ``charge_model`` is not in ``SUPPORTED_CHARGE_MODELS``

        Returns:
            charges (np.ndarray): one charge value per atom
        """
        if charge_model not in ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS:
            raise ValueError(
                f"Unknown charge model {charge_model}. You should provide one of {ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS}"
            )
        mol = dm.to_mol(mol)
        atoms = list(mol.GetAtoms())

        # force compute the partial charges if not provided
        if charge_model == "gasteiger":
            if not atoms[0].HasProp("_GasteigerCharge"):
                rdPartialCharges.ComputeGasteigerCharges(mol)
        elif charge_model == "mmff94":
            if not atoms[0].HasProp("_MMFF94Charge"):
                ff_props = rdForceFieldHelpers.MMFFGetMoleculeProperties(mol)
                for idx, atom in enumerate(atoms):
                    atom.SetDoubleProp("_MMFF94Charge", ff_props.GetMMFFPartialCharge(idx))

        # one reader per supported model; tripos charges are only read, never computed here
        readers = {
            "formal": lambda atom: atom.GetFormalCharge(),
            "gasteiger": lambda atom: atom.GetDoubleProp("_GasteigerCharge"),
            "mmff94": lambda atom: atom.GetDoubleProp("_MMFF94Charge"),
            "tripos": lambda atom: atom.GetDoubleProp("_TriposPartialCharge"),
        }
        read = readers.get(charge_model)
        charges = [] if read is None else [read(atom) for atom in mol.GetAtoms()]
        return np.asarray(charges)

    @requires_conformer
    def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
        r"""
        Compute the 15-value ElectroShape descriptor of a molecule

        Atoms are embedded as 4D points (x, y, z, scaled charge). Distances of all
        atoms to 5 reference points are summarised by mean, std and the cube root
        of the third central moment.

        Args:
            mol: the molecule of interest
            conformer_id (int, optional): Optional conformer id. Defaults to -1.

        Returns:
            shape_descriptor (np.ndarray): computed shape descriptor
        """

        mol = dm.to_mol(mol)
        coords = mol.GetConformer(conformer_id).GetPositions()
        charge = self.compute_charge(mol, self.charge_model)
        if self.replace_nan:
            charge = np.nan_to_num(charge)

        points = np.column_stack((coords, charge * self.electron_scaling))

        # c1: centroid; c2: atom furthest from c1; c3: atom furthest from c2
        centroid = points.mean(axis=0)
        dist_centroid = norm(points - centroid, axis=1)
        far_1 = points[dist_centroid.argmax()]
        dist_far_1 = norm(points - far_1, axis=1)
        far_2 = points[dist_far_1.argmax()]
        dist_far_2 = norm(points - far_2, axis=1)

        # c4 / c5: offset from the centroid along the spatial cross product,
        # paired with the max / min scaled charge
        vec_a = far_1 - centroid
        vec_b = far_2 - centroid
        normal = np.cross(vec_a[:3], vec_b[:3])
        offset = (norm(vec_a) / (2 * norm(normal))) * normal
        anchor = centroid[:3] + offset

        max_charge = np.array(np.amax(charge) * self.electron_scaling)
        min_charge = np.array(np.amin(charge) * self.electron_scaling)
        dist_max = norm(points - np.append(anchor, max_charge), axis=1)
        dist_min = norm(points - np.append(anchor, min_charge), axis=1)

        shape_descriptor = np.zeros(15)
        all_distances = (dist_centroid, dist_far_1, dist_far_2, dist_max, dist_min)
        for pos, distances in enumerate(all_distances):
            base = 3 * pos
            mean = np.mean(distances)
            shape_descriptor[base] = mean
            shape_descriptor[base + 1] = np.std(distances)
            shape_descriptor[base + 2] = cbrt(np.sum(((distances - mean) ** 3) / distances.size))

        return np.nan_to_num(shape_descriptor) if self.replace_nan else shape_descriptor

`SUPPORTED_CHARGE_MODELS = ['gasteiger', 'tripos', 'mmff94', 'formal']` `class-attribute` ¶

`charge_model = charge_model` `instance-attribute` ¶

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`electron_scaling = electron_scaling` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`__call__(mol, conformer_id=-1)` ¶

Get rdkit 3D descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[dm.Mol, str]`	the molecule of interest	required
`conformer_id`	`int`	Optional conformer id. Defaults to -1.	`-1`

Returns:

Name	Type	Description
`shape_descriptor`	`np.ndarray`	computed shape descriptor

Source code in molfeat/calc/shape.py

@requires_conformer
def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
    r"""
    Compute the 15-value ElectroShape descriptor of a molecule

    Atoms are embedded as 4D points (x, y, z, scaled charge). Distances of all
    atoms to 5 reference points are summarised by mean, std and the cube root
    of the third central moment.

    Args:
        mol: the molecule of interest
        conformer_id (int, optional): Optional conformer id. Defaults to -1.

    Returns:
        shape_descriptor (np.ndarray): computed shape descriptor
    """

    mol = dm.to_mol(mol)
    coords = mol.GetConformer(conformer_id).GetPositions()
    charge = self.compute_charge(mol, self.charge_model)
    if self.replace_nan:
        charge = np.nan_to_num(charge)

    points = np.column_stack((coords, charge * self.electron_scaling))

    # c1: centroid; c2: atom furthest from c1; c3: atom furthest from c2
    centroid = points.mean(axis=0)
    dist_centroid = norm(points - centroid, axis=1)
    far_1 = points[dist_centroid.argmax()]
    dist_far_1 = norm(points - far_1, axis=1)
    far_2 = points[dist_far_1.argmax()]
    dist_far_2 = norm(points - far_2, axis=1)

    # c4 / c5: offset from the centroid along the spatial cross product,
    # paired with the max / min scaled charge
    vec_a = far_1 - centroid
    vec_b = far_2 - centroid
    normal = np.cross(vec_a[:3], vec_b[:3])
    offset = (norm(vec_a) / (2 * norm(normal))) * normal
    anchor = centroid[:3] + offset

    max_charge = np.array(np.amax(charge) * self.electron_scaling)
    min_charge = np.array(np.amin(charge) * self.electron_scaling)
    dist_max = norm(points - np.append(anchor, max_charge), axis=1)
    dist_min = norm(points - np.append(anchor, min_charge), axis=1)

    shape_descriptor = np.zeros(15)
    all_distances = (dist_centroid, dist_far_1, dist_far_2, dist_max, dist_min)
    for pos, distances in enumerate(all_distances):
        base = 3 * pos
        mean = np.mean(distances)
        shape_descriptor[base] = mean
        shape_descriptor[base + 1] = np.std(distances)
        shape_descriptor[base + 2] = cbrt(np.sum(((distances - mean) ** 3) / distances.size))

    return np.nan_to_num(shape_descriptor) if self.replace_nan else shape_descriptor

`__getstate__()` ¶

Source code in molfeat/calc/shape.py

def __getstate__(self):
    """Return the state used to serialize the calculator.

    The state holds the three constructor settings and the cached column names.
    """
    return {
        "charge_model": self.charge_model,
        "replace_nan": self.replace_nan,
        "electron_scaling": self.electron_scaling,
        "_columns": self._columns,
    }

`init(charge_model='gasteiger', replace_nan=False, electron_scaling=25.0, **kwargs)` ¶

Constructor for ElectroShape descriptor

Parameters:

Name	Type	Description	Default
`charge_model`	`str`	charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger". Note that formal charges will be computed on the fly if not provided in the input molecules. The `tripos` charge models comes from TRIPOS force field and are often parsed from mol2 files.	`'gasteiger'`
`replace_nan`	`bool`	whether to replace NaN values. Defaults False	`False`
`electron_scaling`	`float`	scaling factor to convert electron charges to Angstroms. Defaults to 25.0.	`25.0`

Source code in molfeat/calc/shape.py

def __init__(
    self,
    charge_model: str = "gasteiger",
    replace_nan: bool = False,
    electron_scaling: float = 25.0,
    **kwargs,
):
    """Set up an ElectroShape descriptor calculator

    Args:
        charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".
            Note that formal charges will be computed on the fly if not provided in the input molecules.
            The `tripos` charge models comes from TRIPOS force field and are often parsed from mol2 files.
        replace_nan: whether to replace NaN values. Defaults False
        electron_scaling: scaling factor to convert electron charges to Angstroms. Defaults to 25.0.
        **kwargs: extra keyword arguments; they are accepted but not used by this constructor
    """
    # the column names are not computed here, the cache starts empty
    self._columns = None
    self.electron_scaling = electron_scaling
    self.replace_nan = replace_nan
    self.charge_model = charge_model

`len()` ¶

Return the length of the calculator

Source code in molfeat/calc/shape.py

def __len__(self):
    """Return the number of columns exposed by the calculator"""
    column_names = self.columns
    return len(column_names)

`compute_charge(mol, charge_model=None)` `staticmethod` ¶

Get the molecular charge of the molecule.

Parameters:

Name	Type	Description	Default
`charge_model`	`str`	charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal'). Defaults to "gasteiger".	`None`

Source code in molfeat/calc/shape.py

@staticmethod
def compute_charge(mol: Union[dm.Mol, str], charge_model: str = None):
    """
    Get the per-atom charges of a molecule.

    Args:
        mol: molecule of interest, converted with `dm.to_mol`
        charge_model: charge model to use. One of ('gasteiger', 'tripos', 'mmff94', 'formal').
            Defaults to "gasteiger" when None.

    Returns:
        charges (np.ndarray): one value per atom, in the order of `mol.GetAtoms()`

    Raises:
        ValueError: if the charge model is not in `SUPPORTED_CHARGE_MODELS`
    """

    if charge_model is None:
        # honour the documented default instead of rejecting None as an unknown model
        charge_model = "gasteiger"
    if charge_model not in ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS:
        raise ValueError(
            f"Unknown charge model {charge_model}. You should provide one of {ElectroShapeDescriptors.SUPPORTED_CHARGE_MODELS}"
        )
    mol = dm.to_mol(mol)
    atom_charge = []
    atom_list = list(mol.GetAtoms())

    # force compute the partial charges if not provided
    # (the first atom is used as a probe, so a molecule without atoms skips this step)
    if atom_list:
        if charge_model == "gasteiger" and not atom_list[0].HasProp("_GasteigerCharge"):
            rdPartialCharges.ComputeGasteigerCharges(mol)
        elif charge_model == "mmff94" and not atom_list[0].HasProp("_MMFF94Charge"):
            ff_infos = rdForceFieldHelpers.MMFFGetMoleculeProperties(mol)
            for i, atom in enumerate(atom_list):
                atom.SetDoubleProp("_MMFF94Charge", ff_infos.GetMMFFPartialCharge(i))

    for atom in mol.GetAtoms():
        if charge_model == "formal":
            atom_charge.append(atom.GetFormalCharge())
        elif charge_model == "gasteiger":
            atom_charge.append(atom.GetDoubleProp("_GasteigerCharge"))
        elif charge_model == "mmff94":
            atom_charge.append(atom.GetDoubleProp("_MMFF94Charge"))
        elif charge_model == "tripos":
            atom_charge.append(atom.GetDoubleProp("_TriposPartialCharge"))
    return np.asarray(atom_charge)

`USRDescriptors` ¶

Bases: SerializableCalculator

Descriptors for the shape of a molecule.

!!! note: The following shape descriptors are offered: * USR: UltraFast Shape Recognition * USRCAT: Ultrafast Shape Recognition with CREDO Atom Types

Source code in molfeat/calc/shape.py

class USRDescriptors(SerializableCalculator):
    """Descriptors for the shape of a molecule.

    !!! note:
        The following shape descriptors are offered:
            * USR: UltraFast Shape Recognition
            * USRCAT: Ultrafast Shape Recognition with CREDO Atom Types
    """

    def __init__(self, method: str = "USR", replace_nan: bool = False, **kwargs):
        """Constructor for USRDescriptors

        Args:
            method: Shape descriptor method to use. One of 'USR', 'USRCAT' (case-insensitive). Default to 'USR'
            replace_nan: Whether to replace nan or infinite values. Defaults to False.
            **kwargs: Extra keyword arguments. They are accepted but not used by this constructor.

        Raises:
            ValueError: If the method is not one of 'USR', 'USRCAT'
        """
        self.method = method.upper()
        if self.method not in ["USR", "USRCAT"]:
            raise ValueError(f"Shape descriptor {self.method} is not supported")
        self.replace_nan = replace_nan
        self._columns = None

    def __getstate__(self):
        """Return the state (method, replace_nan and cached columns) used for serialization"""
        state = {}
        state["method"] = self.method
        state["replace_nan"] = self.replace_nan
        state["_columns"] = self._columns
        return state

    @property
    def columns(self):
        """
        Get the name of all the descriptors of this calculator
        """
        if self._columns is None:
            # 12 names for USR and 60 for USRCAT, all sharing the `usr-` prefix
            if self.method == "USR":
                self._columns = [f"usr-{i}" for i in range(1, 13)]
            elif self.method == "USRCAT":
                self._columns = [f"usr-{i}" for i in range(1, 61)]
        return self._columns

    def __len__(self):
        """Compute descriptors length"""
        return len(self.columns)

    @requires_conformer
    def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
        r"""
        Compute the USR or USRCAT shape descriptors of a molecule

        Args:
            mol: the molecule of interest
            conformer_id (int, optional): Optional conformer id. Defaults to -1.

        Returns:
            shape_descriptors (np.ndarray): computed USR or USRCAT shape descriptors

        Raises:
            ValueError: If `self.method` is neither 'USR' nor 'USRCAT'
        """
        if self.method == "USR":
            shape_descr = rdMolDescriptors.GetUSR(mol, confId=conformer_id)
        elif self.method == "USRCAT":
            shape_descr = rdMolDescriptors.GetUSRCAT(mol, confId=conformer_id)
        else:
            # the constructor validates the method, but the attribute can be reassigned later
            raise ValueError(f"Shape descriptor {self.method} is not supported")
        if self.replace_nan:
            # the second positional argument of `np.nan_to_num` is `copy`, so the flag is not passed
            shape_descr = np.nan_to_num(shape_descr)
        return np.asarray(shape_descr)

`columns` `property` ¶

Get the name of all the descriptors of this calculator

`method = method.upper()` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol, conformer_id=-1)` ¶

Compute the USR or USRCAT shape descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[dm.Mol, str]`	the molecule of interest	required
`conformer_id`	`int`	Optional conformer id. Defaults to -1.	`-1`

Returns:

Name	Type	Description
`shape_descriptors`	`np.ndarray`	array of computed USR or USRCAT shape descriptors

Source code in molfeat/calc/shape.py

@requires_conformer
def __call__(self, mol: Union[dm.Mol, str], conformer_id: Optional[int] = -1):
    r"""
    Compute the USR or USRCAT shape descriptors of a molecule

    Args:
        mol: the molecule of interest
        conformer_id (int, optional): Optional conformer id. Defaults to -1.

    Returns:
        shape_descriptors (np.ndarray): computed USR or USRCAT shape descriptors

    Raises:
        ValueError: If `self.method` is neither 'USR' nor 'USRCAT'
    """
    if self.method == "USR":
        shape_descr = rdMolDescriptors.GetUSR(mol, confId=conformer_id)
    elif self.method == "USRCAT":
        shape_descr = rdMolDescriptors.GetUSRCAT(mol, confId=conformer_id)
    else:
        # the constructor validates the method, but the attribute can be reassigned later
        raise ValueError(f"Shape descriptor {self.method} is not supported")
    if self.replace_nan:
        # the second positional argument of `np.nan_to_num` is `copy`, so the flag is not passed
        shape_descr = np.nan_to_num(shape_descr)
    return np.asarray(shape_descr)

`getstate()` ¶

Source code in molfeat/calc/shape.py

def __getstate__(self):
    """Return the state used to serialize the calculator.

    The state holds the method name, the `replace_nan` flag and the cached column names.
    """
    return {
        "method": self.method,
        "replace_nan": self.replace_nan,
        "_columns": self._columns,
    }

`init(method='USR', replace_nan=False, **kwargs)` ¶

Constructor for USRDescriptors

Parameters:

Name	Type	Description	Default
`method`	`str`	Shape descriptor method to use. One of 'USR', 'USRCAT'. Default to 'USR'	`'USR'`
`replace_nan`	`bool`	Whether to replace nan or infinite values. Defaults to False.	`False`
`augment`		Whether to augment the descriptors with some additional custom features	required
`descrs`		List of features to consider if not None	required

Source code in molfeat/calc/shape.py

def __init__(self, method: str = "USR", replace_nan: bool = False, **kwargs):
    """Set up a USR / USRCAT shape descriptor calculator

    Args:
        method: Shape descriptor method to use. One of 'USR', 'USRCAT' (the value is upper-cased
            before being checked). Default to 'USR'
        replace_nan: Whether to replace nan or infinite values. Defaults to False.
        **kwargs: Extra keyword arguments. They are accepted but not used by this constructor.

    Raises:
        ValueError: If the upper-cased method is neither 'USR' nor 'USRCAT'
    """
    method_name = method.upper()
    # the attribute is stored before validation
    self.method = method_name
    if method_name != "USR" and method_name != "USRCAT":
        raise ValueError(f"Shape descriptor {self.method} is not supported")
    self._columns = None
    self.replace_nan = replace_nan

`len()` ¶

Compute descriptors length

Source code in molfeat/calc/shape.py

def __len__(self):
    """Return the number of descriptors, i.e. the number of columns"""
    column_names = self.columns
    return len(column_names)

`usrdistance(shape_1, shape_2, weights=None)` ¶

Computes similarity between molecules

Parameters:

Name	Type	Description	Default
`shape_1`		USR shape descriptor of first molecule	required
`shape_2`		USR shape descriptor	required
`weights`	`Optional[List[float]]`	List of scaling factors forwarded to RDKit's `GetUSRScore`; an empty list is used when `None`	`None`

Returns:

Name	Type	Description
`dist`		Distance [0-1] between shapes of molecules, 0 indicates identical molecules

Source code in molfeat/calc/shape.py

def usrdistance(
    shape_1,
    shape_2,
    weights: Optional[List[float]] = None,
):
    """Computes the USR score between two shape descriptors

    Args:
        shape_1: USR shape descriptor of first molecule
        shape_2: USR shape descriptor of second molecule
        weights: List of scaling factors forwarded to `rdMolDescriptors.GetUSRScore`.
            An empty list is used when None.

    Returns:
        dist: Value returned by `rdMolDescriptors.GetUSRScore` for the two descriptors.
            NOTE(review): this was documented as a distance in [0-1] where 0 indicates identical
            molecules, whereas RDKit presents GetUSRScore as a similarity score. Confirm which
            convention applies before relying on the direction of the value.

    Raises:
        ValueError: If the descriptors do not share the same last dimension, or if that
            dimension is not 12, 60 or 15
    """

    if weights is None:
        weights = []
    size_1 = shape_1.shape[-1]
    size_2 = shape_2.shape[-1]
    # 12 values for USR, 60 for USRCAT, 15 for Electroshape
    if size_1 == size_2 and size_1 in (12, 60, 15):
        return rdMolDescriptors.GetUSRScore(shape_1, shape_2, weights=weights)

    raise ValueError(
        "Given vectors are not valid USR shape descriptors "
        "or come from different methods. Correct vector lengths "
        "are: 12 for USR, 60 for USRCAT, 15 for Electroshape"
    )

`Atoms Featurizer`¶

`AtomCalculator` ¶

Bases: SerializableCalculator

Base class for computing atom properties compatible with DGLLife

Source code in molfeat/calc/atom.py

class AtomCalculator(SerializableCalculator):
    """
    Base class for computing atom properties compatible with DGLLife
    """

    # default mapping of feature name -> atom featurization function
    DEFAULT_FEATURIZER = {
        "atom_one_hot": atom_one_hot,
        "atom_degree_one_hot": atom_degree_one_hot,
        "atom_implicit_valence_one_hot": atom_implicit_valence_one_hot,
        "atom_hybridization_one_hot": atom_hybridization_one_hot,
        "atom_is_aromatic": atom_is_aromatic,
        "atom_formal_charge": atom_formal_charge,
        "atom_num_radical_electrons": atom_num_radical_electrons,
        "atom_is_in_ring": atom_is_in_ring,
        "atom_total_num_H_one_hot": atom_total_num_H_one_hot,
        "atom_chiral_tag_one_hot": atom_chiral_tag_one_hot,
        "atom_is_chiral_center": atom_is_chiral_center,
    }

    def __init__(
        self,
        featurizer_funcs: Dict[str, Callable] = None,
        concat: bool = True,
        name: str = "hv",
    ):
        """
        Init function of the atom property calculator

        Args:
            featurizer_funcs : Mapping of feature name to the featurization function.
                For compatibility a list of callable/function is still accepted, and the corresponding
                featurizer name will be automatically generated. Each function is of signature
                ``func(rdkit.Chem.rdchem.Atom) -> list or 1D numpy array``.
                Defaults to ``DEFAULT_FEATURIZER`` when None.
            concat: Whether to concat all the data into a single value in the output dict
            name: Name of the key name of the concatenated features
        """
        # snapshot of the call arguments; it has to be taken before any other local is created
        self._input_kwargs = locals().copy()
        self._input_kwargs.pop("self")
        # we also remove the featurizer funcs
        self._input_kwargs.pop("featurizer_funcs", None)
        # small molecule whose first atom is used to measure the size of each feature
        self._toy_mol = dm.to_mol("CCO")
        self._feat_sizes = dict()
        if featurizer_funcs is None:
            featurizer_funcs = self.DEFAULT_FEATURIZER
        if not isinstance(featurizer_funcs, dict):
            # name each callable after its `__name__`, falling back to its repr
            featurizer_funcs = {getattr(fn, "__name__", repr(fn)): fn for fn in featurizer_funcs}
        self.featurizer_funcs = featurizer_funcs
        # fill the feature size cache for every featurizer
        for k in self.featurizer_funcs.keys():
            self.feat_size(feat_name=k)
        self.concat = concat
        self.name = name

    def to_state_dict(self):
        """
        Convert the Atom calculator to a state dict
        Due to some constraints and cross-version compatibility,  the featurizer functions
        need to be pickled and not just return a list

        Returns:
            state_dict: dict with the class name, its module, the constructor arguments
                (featurizer functions hex-encoded) and the molfeat version
        """
        state_dict = {}
        state_dict["name"] = self.__class__.__name__
        state_dict["module"] = self.__class__.__module__
        # work on a copy so that the calculator's own `_input_kwargs` is left untouched
        state_dict["args"] = dict(self._input_kwargs)
        featurizer_fn_pickled = {}
        for fname, ffunc in self.featurizer_funcs.items():
            featurizer_fn_pickled[fname] = fn_to_hex(ffunc)
        state_dict["args"]["featurizer_funcs"] = featurizer_fn_pickled
        state_dict["_molfeat_version"] = MOLFEAT_VERSION

        # only keep the arguments that the constructor of this class accepts
        accepted = inspect.signature(self.__init__).parameters
        to_remove = [k for k in state_dict["args"] if k not in accepted]
        for k in to_remove:
            state_dict["args"].pop(k)

        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
        """Create an instance of an atom calculator from a state dict

        Args:
            state_dict: state dictionary to use to create the atom calculator
            override_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        # EN: at this moment, version compatibility is not enforced
        cls_name = state_dict.get("name", cls.__name__)
        module_name = state_dict.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)
        kwargs = state_dict["args"].copy()
        # now we need to unpickle the featurizer functions
        featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
        if featurizer_fn_pickled is not None:
            featurizer_fn_loaded = {}
            for k, v in featurizer_fn_pickled.items():
                featurizer_fn_loaded[k] = hex_to_fn(v)
            kwargs["featurizer_funcs"] = featurizer_fn_loaded
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Concatenate the data into a single value

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        return concat_dict(data_dict, new_name=self.name)

    def feat_size(self, feat_name=None):
        """Get the feature size for ``feat_name``.

        When there is only one feature, users do not need to provide ``feat_name``.

        Args:
            feat_name: Feature for query.

        Returns:
            int: Feature size for the feature with name ``feat_name``. Default to None.

        Raises:
            ValueError: if ``feat_name`` is not one of the featurizer names
        """
        if feat_name is None:
            assert (
                len(self.featurizer_funcs) == 1
            ), "feat_name should be provided if there are more than one features"
            feat_name = list(self.featurizer_funcs.keys())[0]

        if feat_name not in self.featurizer_funcs:
            raise ValueError(
                "Expect feat_name to be in {}, got {}".format(
                    list(self.featurizer_funcs.keys()), feat_name
                )
            )

        if feat_name not in self._feat_sizes:
            # measure the size once on the first atom of the toy molecule, then cache it
            atom = self._toy_mol.GetAtomWithIdx(0)
            self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](atom))
        return self._feat_sizes[feat_name]

    def __len__(self):
        """Get length of the property estimator"""
        return sum(v for k, v in self._feat_sizes.items() if k != self.name)

    def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None):
        """
        Compute the atom features of a molecule

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
                When ``concat`` is set, the dict is first passed through ``_concat``.
        """
        mol = dm.to_mol(mol)
        num_atoms = mol.GetNumAtoms()
        atom_features = defaultdict(list)

        # Compute features for each atom
        for i in range(num_atoms):
            atom = mol.GetAtomWithIdx(i)
            for feat_name, feat_func in self.featurizer_funcs.items():
                atom_features[feat_name].append(feat_func(atom))

        # Stack the features and convert them to float arrays
        processed_features = dict()
        for feat_name, feat_list in atom_features.items():
            feat = np.stack(feat_list).astype(np.float32)
            processed_features[feat_name] = feat

        if self.concat:
            processed_features = self._concat(processed_features)

        if dtype is not None:
            for feat_name, feat in processed_features.items():
                feat = datatype.cast(feat, dtype=dtype)
                processed_features[feat_name] = feat

        return processed_features

DEFAULT_FEATURIZER = {'atom_one_hot': atom_one_hot, 'atom_degree_one_hot': atom_degree_one_hot, 'atom_implicit_valence_one_hot': atom_implicit_valence_one_hot, 'atom_hybridization_one_hot': atom_hybridization_one_hot, 'atom_is_aromatic': atom_is_aromatic, 'atom_formal_charge': atom_formal_charge, 'atom_num_radical_electrons': atom_num_radical_electrons, 'atom_is_in_ring': atom_is_in_ring, 'atom_total_num_H_one_hot': atom_total_num_H_one_hot, 'atom_chiral_tag_one_hot': atom_chiral_tag_one_hot, 'atom_is_chiral_center': atom_is_chiral_center} `class-attribute` ¶

`concat = concat` `instance-attribute` ¶

`featurizer_funcs = featurizer_funcs` `instance-attribute` ¶

`name = name` `instance-attribute` ¶

`call(mol, dtype=None)` ¶

Get rdkit basic descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`dtype`	`Callable`	requested data type	`None`

Returns:

Name	Type	Description
`dict`		For each function in self.featurizer_funcs with the key `k`, store the computed feature under the key `k`.

Source code in molfeat/calc/atom.py

def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None):
    """
    Compute the atom features of a molecule

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
            When ``concat`` is set, the dict is first passed through ``_concat``.
    """
    mol = dm.to_mol(mol)

    # one list of per-atom values for every featurizer
    per_feature = defaultdict(list)
    for atom_idx in range(mol.GetNumAtoms()):
        atom = mol.GetAtomWithIdx(atom_idx)
        for key, func in self.featurizer_funcs.items():
            per_feature[key].append(func(atom))

    # stack the per-atom values into float32 arrays
    outputs = {key: np.stack(values).astype(np.float32) for key, values in per_feature.items()}

    if self.concat:
        outputs = self._concat(outputs)

    if dtype is not None:
        # values are replaced in place so that the container returned by `_concat` is kept
        for key, feat in outputs.items():
            outputs[key] = datatype.cast(feat, dtype=dtype)

    return outputs

`init(featurizer_funcs=None, concat=True, name='hv')` ¶

Init function of the atom property calculator

Parameters:

Name	Type	Description	Default
`featurizer_funcs`		Mapping of feature name to the featurization function. For compatibility a list of callable/function is still accepted, and the corresponding featurizer name will be automatically generated. Each function is of signature `func(rdkit.Chem.rdchem.Atom) -> list or 1D numpy array`.	`None`
`concat`	`bool`	Whether to concat all the data into a single value in the output dict	`True`
`name`	`str`	Name of the key name of the concatenated features	`'hv'`

Source code in molfeat/calc/atom.py

def __init__(
    self,
    featurizer_funcs: Dict[str, Callable] = None,
    concat: bool = True,
    name: str = "hv",
):
    """
    Set up the atom property calculator

    Args:
        featurizer_funcs : Mapping of feature name to the featurization function.
            For compatibility a list of callable/function is still accepted, and the corresponding
            featurizer name will be automatically generated. Each function is of signature
            ``func(rdkit.Chem.rdchem.Atom) -> list or 1D numpy array``.
            The class-level ``DEFAULT_FEATURIZER`` is used when None.
        concat: Whether to concat all the data into a single value in the output dict
        name: Name of the key name of the concatenated features
    """
    # has to stay the first statement so that only the call arguments are captured
    init_args = dict(locals())
    init_args.pop("self")
    # the featurizer functions are not kept in the recorded arguments
    init_args.pop("featurizer_funcs", None)
    self._input_kwargs = init_args

    # small molecule whose first atom is used to measure the size of each feature
    self._toy_mol = dm.to_mol("CCO")
    self._feat_sizes = {}

    if featurizer_funcs is None:
        featurizer_funcs = self.DEFAULT_FEATURIZER
    if not isinstance(featurizer_funcs, dict):
        # name each callable after its `__name__`, falling back to its repr
        featurizer_funcs = {getattr(fn, "__name__", repr(fn)): fn for fn in featurizer_funcs}
    self.featurizer_funcs = featurizer_funcs

    # fill the feature size cache for every featurizer
    for feat_key in self.featurizer_funcs:
        self.feat_size(feat_name=feat_key)

    self.concat = concat
    self.name = name

`len()` ¶

Get length of the property estimator

Source code in molfeat/calc/atom.py

def __len__(self):
    """Return the summed size of all cached features, skipping the entry named ``self.name``"""
    total = 0
    for feat_key, size in self._feat_sizes.items():
        if feat_key == self.name:
            continue
        total += size
    return total

`feat_size(feat_name=None)` ¶

Get the feature size for feat_name.

When there is only one feature, users do not need to provide feat_name.

Parameters:

Name	Type	Description	Default
`feat_name`		Feature for query.	`None`

Returns:

Name	Type	Description
`int`		Feature size for the feature with name `feat_name`. Default to None.

Source code in molfeat/calc/atom.py

def feat_size(self, feat_name=None):
    """Return the size of the feature called ``feat_name``.

    ``feat_name`` can be omitted when the calculator holds exactly one featurizer.

    Args:
        feat_name: Feature for query.

    Returns:
        int: Feature size for the feature with name ``feat_name``.

    Raises:
        ValueError: if ``feat_name`` is not one of the featurizer names
    """
    funcs = self.featurizer_funcs
    if feat_name is None:
        assert (
            len(funcs) == 1
        ), "feat_name should be provided if there are more than one features"
        feat_name = list(funcs)[0]

    if feat_name not in funcs:
        raise ValueError(
            "Expect feat_name to be in {}, got {}".format(
                list(self.featurizer_funcs.keys()), feat_name
            )
        )

    size_cache = self._feat_sizes
    if feat_name in size_cache:
        return size_cache[feat_name]

    # measure the size once on the first atom of the toy molecule, then cache it
    probe_atom = self._toy_mol.GetAtomWithIdx(0)
    size_cache[feat_name] = len(funcs[feat_name](probe_atom))
    return size_cache[feat_name]

`from_state_dict(state_dict, override_args=None)` `classmethod` ¶

Create an instance of an atom calculator from a state dict

Parameters:

Name	Type	Description	Default
`state_dict`		state dictionary to use to create the atom calculator	required
`override_args`	`Optional[dict]`	optional dictionary of arguments to override the ones in the state dict at construction of the new object	`None`

Source code in molfeat/calc/atom.py

@classmethod
def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
    """Build an atom calculator from a state dict

    The class to instantiate is looked up from the ``module`` and ``name`` entries of the
    state dict, falling back to the module and name of ``cls``.

    Args:
        state_dict: state dictionary to use to create the atom calculator
        override_args: optional dictionary of arguments to override the ones in the state dict
            at construction of the new object
    """
    # no version compatibility check is performed here
    target_module = importlib.import_module(state_dict.get("module", cls.__module__))
    target_cls = getattr(target_module, state_dict.get("name", cls.__name__))

    # copy so that the caller's state dict is not modified
    init_kwargs = state_dict["args"].copy()

    # the featurizer functions are stored in an encoded form and need to be decoded
    encoded_funcs = init_kwargs.pop("featurizer_funcs", None)
    if encoded_funcs is not None:
        init_kwargs["featurizer_funcs"] = {
            key: hex_to_fn(encoded) for key, encoded in encoded_funcs.items()
        }

    if override_args:
        init_kwargs.update(**override_args)
    return target_cls(**init_kwargs)

`to_state_dict()` ¶

Convert the Atom calculator to a state dict. Due to some constraints and cross-version compatibility, the featurizer functions need to be pickled rather than simply returned as a list.

Source code in molfeat/calc/atom.py

def to_state_dict(self):
    """
    Convert the Atom calculator to a state dict
    Due to some constraints and cross-version compatibility,  the featurizer functions
    need to be pickled and not just return a list

    Returns:
        state_dict: dict with the class name, its module, the constructor arguments
            (featurizer functions hex-encoded) and the molfeat version
    """
    state_dict = {}
    state_dict["name"] = self.__class__.__name__
    state_dict["module"] = self.__class__.__module__
    # work on a copy so that the calculator's own `_input_kwargs` is left untouched
    state_dict["args"] = dict(self._input_kwargs)
    state_dict["args"]["featurizer_funcs"] = {
        fname: fn_to_hex(ffunc) for fname, ffunc in self.featurizer_funcs.items()
    }
    state_dict["_molfeat_version"] = MOLFEAT_VERSION

    # only keep the arguments that the constructor of this class accepts
    accepted = inspect.signature(self.__init__).parameters
    to_remove = [k for k in state_dict["args"] if k not in accepted]
    for k in to_remove:
        state_dict["args"].pop(k)

    return state_dict

`AtomMaterialCalculator` ¶

Bases: AtomCalculator

Atom calculator with the extended atomic property list, which has been collected from various material science packages

Source code in molfeat/calc/atom.py

class AtomMaterialCalculator(AtomCalculator):
    """Atom calculator relying on the extended atomic property list
    which has been collected from various material science packages
    """

    # feature name -> featurization function, in the order in which they are declared
    DEFAULT_FEATURIZER = dict(
        atom_one_hot=atom_one_hot,
        atom_extended_properties=atom_extended_properties,
        atom_degree_one_hot=atom_degree_one_hot,
        atom_implicit_valence_one_hot=atom_implicit_valence_one_hot,
        atom_hybridization_one_hot=atom_hybridization_one_hot,
        atom_is_aromatic=atom_is_aromatic,
        atom_formal_charge=atom_formal_charge,
        atom_num_radical_electrons=atom_num_radical_electrons,
        atom_is_in_ring=atom_is_in_ring,
        atom_chiral_tag_one_hot=atom_chiral_tag_one_hot,
        atom_is_chiral_center=atom_is_chiral_center,
    )

DEFAULT_FEATURIZER = {'atom_one_hot': atom_one_hot, 'atom_extended_properties': atom_extended_properties, 'atom_degree_one_hot': atom_degree_one_hot, 'atom_implicit_valence_one_hot': atom_implicit_valence_one_hot, 'atom_hybridization_one_hot': atom_hybridization_one_hot, 'atom_is_aromatic': atom_is_aromatic, 'atom_formal_charge': atom_formal_charge, 'atom_num_radical_electrons': atom_num_radical_electrons, 'atom_is_in_ring': atom_is_in_ring, 'atom_chiral_tag_one_hot': atom_chiral_tag_one_hot, 'atom_is_chiral_center': atom_is_chiral_center} `class-attribute` ¶

`DGLCanonicalAtomCalculator` ¶

Bases: AtomCalculator

Default canonical featurizer for atoms used by dgllife

Source code in molfeat/calc/atom.py

class DGLCanonicalAtomCalculator(AtomCalculator):
    """Default canonical featurizer for atoms used by dgllife"""

    # feature name -> featurization function, in the order in which they are declared
    DEFAULT_FEATURIZER = dict(
        atom_one_hot=atom_one_hot,
        atom_degree_one_hot=atom_degree_one_hot,
        atom_implicit_valence_one_hot=atom_implicit_valence_one_hot,
        atom_formal_charge=atom_formal_charge,
        atom_num_radical_electrons=atom_num_radical_electrons,
        atom_hybridization_one_hot=partial(
            atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST
        ),
        atom_is_aromatic=atom_is_aromatic,
        atom_total_num_H_one_hot=atom_total_num_H_one_hot,
    )

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Merge all the features into a single entry named ``self.name``

        The names of the featurizers, in their declaration order, are passed as the
        ``order`` argument of ``concat_dict``.

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        feature_order = list(self.featurizer_funcs)
        return concat_dict(data_dict, new_name=self.name, order=feature_order)

`DEFAULT_FEATURIZER = {'atom_one_hot': atom_one_hot, 'atom_degree_one_hot': atom_degree_one_hot, 'atom_implicit_valence_one_hot': atom_implicit_valence_one_hot, 'atom_formal_charge': atom_formal_charge, 'atom_num_radical_electrons': atom_num_radical_electrons, 'atom_hybridization_one_hot': partial(atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST), 'atom_is_aromatic': atom_is_aromatic, 'atom_total_num_H_one_hot': atom_total_num_H_one_hot}` `class-attribute` ¶

`DGLWeaveAtomCalculator` ¶

Bases: DGLCanonicalAtomCalculator

Default atom featurizer used by WeaveNet in DGLLife

Source code in molfeat/calc/atom.py

class DGLWeaveAtomCalculator(DGLCanonicalAtomCalculator):
    """Default atom featurizer used by WeaveNet in DGLLife"""

    DEFAULT_FEATURIZER = {
        "atom_one_hot": partial(
            atom_one_hot, allowable_set=DGLLIFE_WEAVE_ATOMS, encode_unknown=True
        ),
        "atom_chiral_tag_one_hot": partial(
            atom_chiral_tag_one_hot, allowable_set=DGLLIFE_WEAVE_CHIRAL_TYPES
        ),
        "atom_formal_charge": atom_formal_charge,
        "atom_partial_charge": atom_partial_charge,
        "atom_is_aromatic": atom_is_aromatic,
        "atom_hybridization_one_hot": partial(
            atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST[:3]
        ),
    }

    def __init__(self, concat: bool = True, name: str = "hv"):
        """Init function of the WeaveNet atom calculator

        Args:
            concat: forwarded as ``concat`` to the parent constructor
            name: forwarded as ``name`` to the parent constructor
        """
        # Work on a copy of the class-level defaults. Adding the bound method to
        # DEFAULT_FEATURIZER itself would mutate state shared by the class and by
        # every instance (all of them would end up calling the method of the
        # most recently created instance).
        featurizer_funcs = dict(self.DEFAULT_FEATURIZER)
        featurizer_funcs["atom_weavenet_props"] = self.atom_weave_props
        super().__init__(concat=concat, name=name, featurizer_funcs=featurizer_funcs)

    def _get_atom_state_info(self, feats):
        """Get atom Donor/Acceptor state information from chemical pharmacophore features

        Args:
            feats: computed chemical features

        Returns:
            is_donor, is_acceptor: two ``defaultdict(bool)`` mapping an atom index to True
                when the atom belongs to a "Donor" (resp. "Acceptor") feature
        """
        is_donor = defaultdict(bool)
        is_acceptor = defaultdict(bool)
        # Get hydrogen bond donor/acceptor information
        for feat in feats:
            family = feat.GetFamily()
            if family == "Donor":
                for u in feat.GetAtomIds():
                    is_donor[u] = True
            elif family == "Acceptor":
                for u in feat.GetAtomIds():
                    is_acceptor[u] = True
        return is_donor, is_acceptor

    # NOTE(review): `lru_cache` on a method keys the cache on (self, mol) and keeps
    # both objects referenced while the entry is cached. The cached list is also
    # handed out as-is, so callers must not modify it in place.
    @lru_cache
    def _compute_weave_net_properties(self, mol: rdchem.Mol):
        """Compute the WeaveNet properties of all the atoms of a molecule

        Args:
            mol: the molecule of interest

        Returns:
            atom_features: one list per atom (indexed by atom position) holding
                ``[is_donor, is_acceptor]`` as floats followed by 6 counts: the number of
                rings of size 3, 4, ..., 8 (from ``GetSymmSSSR``) the atom belongs to
        """
        # Get information for donor and acceptor
        fdef_name = os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
        chem_feats = ChemicalFeatures.BuildFeatureFactory(fdef_name)
        mol_feats = chem_feats.GetFeaturesForMol(mol)
        is_donor, is_acceptor = self._get_atom_state_info(mol_feats)
        sssr = GetSymmSSSR(mol)
        num_atoms = mol.GetNumAtoms()
        atom_features = []
        for i in range(num_atoms):
            cur_atom_props = [float(is_donor[i]), float(is_acceptor[i])]
            # Count the number of rings the atom belongs to for ring size between 3 and 8
            count = [0 for _ in range(3, 9)]
            for ring in sssr:
                ring_size = len(ring)
                if i in ring and 3 <= ring_size <= 8:
                    count[ring_size - 3] += 1
            cur_atom_props.extend(count)
            atom_features.append(cur_atom_props)
        return atom_features

    def atom_weave_props(self, atom: rdchem.Atom):
        """Get the WeaveNet properties for an atom

        Args:
            atom: the atom of interest

        Returns:
            the entry of ``_compute_weave_net_properties`` (computed on the molecule
            owning the atom) located at the index of the atom
        """
        mol = atom.GetOwningMol()
        feats = self._compute_weave_net_properties(mol)
        return feats[atom.GetIdx()]

    def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None):
        """
        Compute the atom features of a molecule, after computing its Gasteiger charges

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
        """
        # NOTE(review): `mol` is handed to RDKit before any conversion happens here,
        # so a `str` input presumably needs to be converted by the caller -- confirm.
        # The return value is discarded: the call is made for its effect on `mol`.
        AllChem.ComputeGasteigerCharges(mol)
        return super().__call__(
            mol,
            dtype,
        )

`DEFAULT_FEATURIZER = {'atom_one_hot': partial(atom_one_hot, allowable_set=DGLLIFE_WEAVE_ATOMS, encode_unknown=True), 'atom_chiral_tag_one_hot': partial(atom_chiral_tag_one_hot, allowable_set=DGLLIFE_WEAVE_CHIRAL_TYPES), 'atom_formal_charge': atom_formal_charge, 'atom_partial_charge': atom_partial_charge, 'atom_is_aromatic': atom_is_aromatic, 'atom_hybridization_one_hot': partial(atom_hybridization_one_hot, allowable_set=DGLLIFE_HYBRIDIZATION_LIST[:3])}` `class-attribute` ¶

`__call__(mol, dtype=None)` ¶

Get rdkit basic descriptors for a molecule

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`dtype`	`Callable`	requested data type	`None`

Returns:

Name	Type	Description
`dict`		For each function in self.featurizer_funcs with the key `k`, store the computed feature under the key `k`.

Source code in molfeat/calc/atom.py

def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None):
    """
    Compute the atom features of a molecule, after computing its Gasteiger charges

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict:  For each function in self.featurizer_funcs with the key ``k``, store the computed feature under the key ``k``.
    """
    # The return value is discarded: the call is made for its effect on `mol`.
    # NOTE(review): no conversion happens before this call, so a `str` input
    # presumably has to be converted by the caller -- confirm.
    AllChem.ComputeGasteigerCharges(mol)
    features = super().__call__(mol, dtype)
    return features

`__init__(concat=True, name='hv')` ¶

Source code in molfeat/calc/atom.py

def __init__(self, concat: bool = True, name: str = "hv"):
    """Init function of the WeaveNet atom calculator

    Args:
        concat: forwarded as ``concat`` to the parent constructor
        name: forwarded as ``name`` to the parent constructor
    """
    # Work on a copy of the class-level defaults. Adding the bound method to
    # DEFAULT_FEATURIZER itself would mutate state shared by the class and by
    # every instance (all of them would end up calling the method of the
    # most recently created instance).
    featurizer_funcs = dict(self.DEFAULT_FEATURIZER)
    featurizer_funcs["atom_weavenet_props"] = self.atom_weave_props
    super().__init__(concat=concat, name=name, featurizer_funcs=featurizer_funcs)

`atom_weave_props(atom)` ¶

Get the WeaveNet properties for an atom

Source code in molfeat/calc/atom.py

def atom_weave_props(self, atom: rdchem.Atom):
    """Get the WeaveNet properties for an atom

    The properties are computed for the whole molecule owning the atom, then the
    entry located at the index of the atom is returned.
    """
    per_atom_props = self._compute_weave_net_properties(atom.GetOwningMol())
    return per_atom_props[atom.GetIdx()]

`Bonds Featurizer`¶

`BondCalculator` ¶

Bases: SerializableCalculator

A class for bond featurizer which loops over all bonds in a molecule and featurizes them with the featurizer_funcs. The constructed graph is assumed to be a bi-directed graph by default.

Source code in molfeat/calc/bond.py

class BondCalculator(SerializableCalculator):
    """
    A class for bond featurizer which loops over all bonds in a molecule and
    featurizes them with the ``featurizer_funcs``. The constructed graph is assumed to be
    a bi-directed graph by default.
    """

    DEFAULT_FEATURIZER = {
        "bond_type_one_hot": bond_type_one_hot,
        "bond_stereo_one_hot": bond_stereo_one_hot,
        "bond_is_in_ring": bond_is_in_ring,
        "bond_is_conjugated": bond_is_conjugated,
        "bond_direction_one_hot": bond_direction_one_hot,
    }

    def __init__(
        self,
        featurizer_funcs: Union[list, dict] = None,
        self_loop: bool = False,
        concat: bool = True,
        name: str = "he",
    ):
        """
        Init function of the bond property calculator

        Args:
            featurizer_funcs: Mapping feature name to the featurization function.
                A list of functions is also accepted, in which case each function is
                registered under its ``__name__`` (or its ``repr`` when it has none).
                Defaults to ``DEFAULT_FEATURIZER`` when None.
            self_loop: Whether self loops will be added. Default to False. If True, an additional
                column of binary values to indicate the identity of self loops will be added.
                The other features of the self loops will be zero.
            concat: Whether to concat all the data into a single value in the output dict
            name: Name of the key name of the concatenated features
        """
        # must stay the first statement: it snapshots the constructor arguments
        self._input_kwargs = locals().copy()
        self._input_kwargs.pop("self")
        # remove featurizer_funcs too
        self._input_kwargs.pop("featurizer_funcs", None)
        # toy molecule used to probe the output size of each featurization function
        self._toy_mol = dm.to_mol("CO")
        self._feat_sizes = dict()
        if featurizer_funcs is None:
            featurizer_funcs = self.DEFAULT_FEATURIZER
        if not isinstance(featurizer_funcs, dict):
            featurizer_funcs = {getattr(fn, "__name__", repr(fn)): fn for fn in featurizer_funcs}
        self.featurizer_funcs = featurizer_funcs
        self._self_loop = self_loop
        self.concat = concat
        self.name = name
        for k in self.featurizer_funcs.keys():
            self.feat_size(feat_name=k)
        if self._self_loop:
            self._feat_sizes["self_loop"] = 1

    def to_state_dict(self):
        """Convert the bond calculator to a state dict
        Due to some constraints and cross-version compatibility,  the featurizer functions
        need to be pickled and not just listed

        Returns:
            state_dict: dict with the keys ``name``, ``module``, ``args`` and ``_molfeat_version``
        """
        state_dict = {}
        state_dict["name"] = self.__class__.__name__
        state_dict["module"] = self.__class__.__module__
        # Copy the stored arguments: serializing must neither alter the arguments kept
        # on the instance nor hand out a reference to them.
        args = dict(self._input_kwargs)
        args["featurizer_funcs"] = {
            fname: fn_to_hex(ffunc) for fname, ffunc in self.featurizer_funcs.items()
        }
        # Only keep the arguments accepted by the constructor of the actual class
        accepted = inspect.signature(self.__init__).parameters
        state_dict["args"] = {k: v for k, v in args.items() if k in accepted}
        state_dict["_molfeat_version"] = MOLFEAT_VERSION
        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
        """Create an instance of a bond calculator from a state dict

        Args:
            state_dict: state dictionary to use to create the bond calculator
            override_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        # EN: at this moment, version compatibility is not enforced
        cls_name = state_dict.get("name", cls.__name__)
        module_name = state_dict.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)

        kwargs = state_dict["args"].copy()
        # now we need to unpickle the featurizer functions
        featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
        if featurizer_fn_pickled is not None:
            featurizer_fn_loaded = {}
            for k, v in featurizer_fn_pickled.items():
                featurizer_fn_loaded[k] = hex_to_fn(v)
            kwargs["featurizer_funcs"] = featurizer_fn_loaded
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Concatenate the data into a single value

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        return concat_dict(data_dict, new_name=self.name)

    def feat_size(self, feat_name: Optional[str] = None):
        """Get the feature size for ``feat_name``.

        When there is only one feature, ``feat_name`` can be None.

        Args:
            feat_name: Feature for query.

        Returns:
            int: Feature size for the feature with name ``feat_name``. Default to None.

        Raises:
            ValueError: When ``feat_name`` is not a registered feature
        """
        if feat_name is None:
            assert (
                len(self.featurizer_funcs) == 1
            ), "feat_name should be provided if there are more than one features"
            feat_name = list(self.featurizer_funcs.keys())[0]

        if feat_name not in self.featurizer_funcs:
            raise ValueError(
                "Expect feat_name to be in {}, got {}".format(
                    list(self.featurizer_funcs.keys()), feat_name
                )
            )
        if feat_name not in self._feat_sizes:
            # the size is measured once on the first bond of the toy molecule, then cached
            bond = self._toy_mol.GetBondWithIdx(0)
            self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](bond))
        return self._feat_sizes[feat_name]

    def __len__(self):
        """Get length of the property estimator"""
        return sum(v for k, v in self._feat_sizes.items() if k != self.name)

    def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None, **kwargs):
        """Featurize all bonds in a molecule.

        Args:
            mol: the molecule of interest
            dtype: requested data type

        Returns:
            dict: For each function in self.featurizer_funcs with the key ``k``,
                store the computed feature under the key ``k``.
        """
        mol = dm.to_mol(mol)
        num_bonds = mol.GetNumBonds()
        bond_features = defaultdict(list)

        # Compute features for each bond
        for i in range(num_bonds):
            bond = mol.GetBondWithIdx(i)
            for feat_name, feat_func in self.featurizer_funcs.items():
                feat = feat_func(bond)
                # each bond is stored twice, once per direction
                bond_features[feat_name].extend([feat, feat.copy()])

        # Stack the features and convert them to float arrays
        processed_features = dict()
        for feat_name, feat_list in bond_features.items():
            feat = np.stack(feat_list)
            processed_features[feat_name] = feat

        if self._self_loop and num_bonds > 0:
            num_atoms = mol.GetNumAtoms()
            for feat_name in processed_features:
                feats = processed_features[feat_name]
                # self loops get one all-zero row per atom for every regular feature
                add_edges = np.zeros((num_atoms, feats.shape[1]))
                feats = np.concatenate([feats, add_edges], axis=0)
                processed_features[feat_name] = feats
            # indicator column: 0 for the real (bi-directed) bonds, 1 for the self loops
            self_loop_feats = np.concatenate(
                [np.zeros((num_bonds * 2, 1)), np.ones((num_atoms, 1))]
            )

            processed_features["self_loop"] = self_loop_feats

        if self._self_loop and num_bonds == 0:
            num_atoms = mol.GetNumAtoms()
            # Featurize the toy molecule, without concatenation, only to learn the width
            # of every feature. `concat` is restored even if the featurization fails.
            old_concat = self.concat
            self.concat = False
            try:
                processed_features = self(self._toy_mol)
            finally:
                self.concat = old_concat
            for feat_name in processed_features:
                feats = processed_features[feat_name]
                feats = np.zeros((num_atoms, feats.shape[1]))
                processed_features[feat_name] = feats
        if self.concat and (num_bonds > 0 or self._self_loop):
            processed_features = self._concat(processed_features)
        if dtype is not None:
            for feat_name, feat in processed_features.items():
                feat = datatype.cast(feat, dtype=dtype)
                processed_features[feat_name] = feat

        return processed_features

`DEFAULT_FEATURIZER = {'bond_type_one_hot': bond_type_one_hot, 'bond_stereo_one_hot': bond_stereo_one_hot, 'bond_is_in_ring': bond_is_in_ring, 'bond_is_conjugated': bond_is_conjugated, 'bond_direction_one_hot': bond_direction_one_hot}` `class-attribute` ¶

`concat = concat` `instance-attribute` ¶

`featurizer_funcs = featurizer_funcs` `instance-attribute` ¶

`name = name` `instance-attribute` ¶

`__call__(mol, dtype=None, **kwargs)` ¶

Featurize all bonds in a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`dtype`	`Callable`	requested data type	`None`

Returns:

Name	Type	Description
`dict`		For each function in self.featurizer_funcs with the key `k`, store the computed feature under the key `k`.

Source code in molfeat/calc/bond.py

def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None, **kwargs):
    """Featurize all bonds in a molecule.

    Args:
        mol: the molecule of interest
        dtype: requested data type

    Returns:
        dict: For each function in self.featurizer_funcs with the key ``k``,
            store the computed feature under the key ``k``.
    """
    mol = dm.to_mol(mol)
    num_bonds = mol.GetNumBonds()
    bond_features = defaultdict(list)

    # Compute features for each bond
    for i in range(num_bonds):
        bond = mol.GetBondWithIdx(i)
        for feat_name, feat_func in self.featurizer_funcs.items():
            feat = feat_func(bond)
            # each bond is stored twice, once per direction
            bond_features[feat_name].extend([feat, feat.copy()])

    # Stack the features and convert them to float arrays
    processed_features = dict()
    for feat_name, feat_list in bond_features.items():
        feat = np.stack(feat_list)
        processed_features[feat_name] = feat

    if self._self_loop and num_bonds > 0:
        num_atoms = mol.GetNumAtoms()
        for feat_name in processed_features:
            feats = processed_features[feat_name]
            # self loops get one all-zero row per atom for every regular feature
            add_edges = np.zeros((num_atoms, feats.shape[1]))
            feats = np.concatenate([feats, add_edges], axis=0)
            processed_features[feat_name] = feats
        # indicator column: 0 for the real (bi-directed) bonds, 1 for the self loops
        self_loop_feats = np.concatenate(
            [np.zeros((num_bonds * 2, 1)), np.ones((num_atoms, 1))]
        )

        processed_features["self_loop"] = self_loop_feats

    if self._self_loop and num_bonds == 0:
        num_atoms = mol.GetNumAtoms()
        # Featurize the toy molecule, without concatenation, only to learn the width
        # of every feature. `concat` is restored even if the featurization fails.
        old_concat = self.concat
        self.concat = False
        try:
            processed_features = self(self._toy_mol)
        finally:
            self.concat = old_concat
        for feat_name in processed_features:
            feats = processed_features[feat_name]
            feats = np.zeros((num_atoms, feats.shape[1]))
            processed_features[feat_name] = feats
    if self.concat and (num_bonds > 0 or self._self_loop):
        processed_features = self._concat(processed_features)
    if dtype is not None:
        for feat_name, feat in processed_features.items():
            feat = datatype.cast(feat, dtype=dtype)
            processed_features[feat_name] = feat

    return processed_features

`__init__(featurizer_funcs=None, self_loop=False, concat=True, name='he')` ¶

Init function of the bond property calculator

Parameters:

Name	Type	Description	Default
`featurizer_funcs`	`Union[list, dict]`	Mapping feature name to the featurization function.	`None`
`self_loop`	`bool`	Whether self loops will be added. Default to False. If True, an additional column of binary values to indicate the identity of self loops will be added. The other features of the self loops will be zero.	`False`
`concat`	`bool`	Whether to concat all the data into a single value in the output dict	`True`
`name`	`str`	Name of the key name of the concatenated features	`'he'`

Source code in molfeat/calc/bond.py

def __init__(
    self,
    featurizer_funcs: Union[list, dict] = None,
    self_loop: bool = False,
    concat: bool = True,
    name: str = "he",
):
    """
    Init function of the bond property calculator

    Args:
        featurizer_funcs: Mapping feature name to the featurization function.
            When an iterable of functions is given instead, each function is registered
            under its ``__name__`` (or its ``repr`` when it has none).
        self_loop: Whether self loops will be added. Default to False. If True, an additional
            column of binary values to indicate the identity of self loops will be added.
            The other features of the self loops will be zero.
        concat: Whether to concat all the data into a single value in the output dict
        name: Name of the key name of the concatenated features
    """
    # must stay the first statement: it snapshots the constructor arguments
    self._input_kwargs = locals().copy()
    # neither `self` nor the featurization functions are kept in the snapshot
    self._input_kwargs.pop("self")
    self._input_kwargs.pop("featurizer_funcs", None)

    # toy molecule used to probe the output size of each featurization function
    self._toy_mol = dm.to_mol("CO")
    self._feat_sizes = dict()

    if featurizer_funcs is None:
        featurizer_funcs = self.DEFAULT_FEATURIZER
    if not isinstance(featurizer_funcs, dict):
        featurizer_funcs = {getattr(fn, "__name__", repr(fn)): fn for fn in featurizer_funcs}

    self.featurizer_funcs = featurizer_funcs
    self._self_loop = self_loop
    self.concat = concat
    self.name = name

    # fill the size cache for every registered feature
    for registered_name in self.featurizer_funcs:
        self.feat_size(feat_name=registered_name)
    if self._self_loop:
        self._feat_sizes["self_loop"] = 1

`__len__()` ¶

Get length of the property estimator

Source code in molfeat/calc/bond.py

def __len__(self):
    """Get length of the property estimator

    This is the sum of all the cached feature sizes, ignoring any entry stored
    under ``self.name``.
    """
    total = 0
    for feat_name, size in self._feat_sizes.items():
        if feat_name != self.name:
            total += size
    return total

`feat_size(feat_name=None)` ¶

Get the feature size for feat_name.

When there is only one feature, feat_name can be None.

Parameters:

Name	Type	Description	Default
`feat_name`	`Optional[str]`	Feature for query.	`None`

Returns:

Name	Type	Description
`int`		Feature size for the feature with name `feat_name`. Default to None.

Source code in molfeat/calc/bond.py

def feat_size(self, feat_name: Optional[str] = None):
    """Get the feature size for ``feat_name``.

    When there is only one feature, ``feat_name`` can be None. Sizes are measured
    once, on the first bond of the toy molecule, then cached.

    Args:
        feat_name: Feature for query.

    Returns:
        int: Feature size for the feature with name ``feat_name``. Default to None.

    Raises:
        ValueError: When ``feat_name`` is not a registered feature
    """
    registered = self.featurizer_funcs
    if feat_name is None:
        assert (
            len(registered) == 1
        ), "feat_name should be provided if there are more than one features"
        feat_name = list(registered.keys())[0]

    if feat_name not in registered:
        raise ValueError(
            "Expect feat_name to be in {}, got {}".format(list(registered.keys()), feat_name)
        )

    # cached sizes are returned directly
    if feat_name in self._feat_sizes:
        return self._feat_sizes[feat_name]

    toy_bond = self._toy_mol.GetBondWithIdx(0)
    measured = len(registered[feat_name](toy_bond))
    self._feat_sizes[feat_name] = measured
    return measured

`from_state_dict(state_dict, override_args=None)` `classmethod` ¶

Create an instance of a bond calculator from a state dict

Parameters:

Name	Type	Description	Default
`state_dict`		state dictionary to use to create the atom calculator	required
`override_args`	`Optional[dict]`	optional dictionary of arguments to override the ones in the state dict at construction of the new object	`None`

Source code in molfeat/calc/bond.py

@classmethod
def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
    """Create an instance of a bond calculator from a state dict

    Args:
        state_dict: state dictionary to use to create the bond calculator. The class to
            instantiate is looked up from its ``name`` and ``module`` entries, falling
            back to ``cls`` when they are missing.
        override_args: optional dictionary of arguments to override the ones in the state dict
            at construction of the new object
    """
    # EN: at this moment, version compatibility is not enforced
    target_module = importlib.import_module(state_dict.get("module", cls.__module__))
    klass = getattr(target_module, state_dict.get("name", cls.__name__))

    # shallow copy, so that the caller's state dict is left untouched
    init_args = state_dict["args"].copy()
    # the featurizer functions are stored pickled and must be loaded back
    pickled_funcs = init_args.pop("featurizer_funcs", None)
    if pickled_funcs is not None:
        init_args["featurizer_funcs"] = {
            fname: hex_to_fn(blob) for fname, blob in pickled_funcs.items()
        }
    if override_args:
        init_args.update(**override_args)
    return klass(**init_args)

`to_state_dict()` ¶

Convert the bond calculator to a state dict. Due to some constraints and cross-version compatibility, the featurizer functions need to be pickled and not just listed.

Source code in molfeat/calc/bond.py

def to_state_dict(self):
    """Convert the bond calculator to a state dict
    Due to some constraints and cross-version compatibility,  the featurizer functions
    need to be pickled and not just listed

    Returns:
        state_dict: dict with the keys ``name``, ``module``, ``args`` and ``_molfeat_version``
    """
    state_dict = {}
    state_dict["name"] = self.__class__.__name__
    state_dict["module"] = self.__class__.__module__
    # Copy the stored arguments: serializing must neither alter the arguments kept
    # on the instance nor hand out a reference to them.
    args = dict(self._input_kwargs)
    args["featurizer_funcs"] = {
        fname: fn_to_hex(ffunc) for fname, ffunc in self.featurizer_funcs.items()
    }
    # Only keep the arguments accepted by the constructor of the actual class
    accepted = inspect.signature(self.__init__).parameters
    state_dict["args"] = {k: v for k, v in args.items() if k in accepted}
    state_dict["_molfeat_version"] = MOLFEAT_VERSION
    return state_dict

`DGLCanonicalBondCalculator` ¶

Bases: BondCalculator

Source code in molfeat/calc/bond.py

class DGLCanonicalBondCalculator(BondCalculator):
    # The insertion order is kept on purpose: `_concat` forwards the key order
    # of the featurizer mapping to `concat_dict`.
    DEFAULT_FEATURIZER = dict(
        bond_type_one_hot=bond_type_one_hot,
        bond_is_conjugated=bond_is_conjugated,
        bond_is_in_ring=bond_is_in_ring,
        bond_stereo_one_hot=bond_stereo_one_hot,
    )

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Merge all the computed features into a single entry

        The key order of ``self.featurizer_funcs`` is passed to ``concat_dict``
        as the requested ``order``.

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        feature_order = list(self.featurizer_funcs.keys())
        return concat_dict(data_dict, new_name=self.name, order=feature_order)

`DEFAULT_FEATURIZER = {'bond_type_one_hot': bond_type_one_hot, 'bond_is_conjugated': bond_is_conjugated, 'bond_is_in_ring': bond_is_in_ring, 'bond_stereo_one_hot': bond_stereo_one_hot}` `class-attribute` ¶

`DGLWeaveEdgeCalculator` ¶

Bases: EdgeMatCalculator

Edge featurizer used by WeaveNets

The edge featurization is introduced in [Molecular Graph Convolutions: Moving Beyond Fingerprints](https://arxiv.org/abs/1603.00856).

This featurization is performed for a complete graph of atoms with self loops added, which considers the following default:

- Number of bonds between each pair of atoms
- One-hot encoding of bond type if a bond exists between a pair of atoms
- Whether a pair of atoms belongs to the same ring

Source code in molfeat/calc/bond.py

class DGLWeaveEdgeCalculator(EdgeMatCalculator):
    """Edge featurizer used by WeaveNets

    The edge featurization is introduced in `Molecular Graph Convolutions:
    Moving Beyond Fingerprints <https://arxiv.org/abs/1603.00856>`__.

    This featurization is performed for a complete graph of atoms with self loops added,
    which considers the following default:

    * Number of bonds between each pairs of atoms
    * One-hot encoding of bond type if a bond exists between a pair of atoms
    * Whether a pair of atoms belongs to a same ring

    """

    # no per-bond featurizer by default, only pairwise ones
    DEFAULT_FEATURIZER = dict()
    DEFAULT_PAIRWISE_FEATURIZER = dict(
        pairwise_dist_indicator=pairwise_dist_indicator,
        pairwise_bond_indicator=pairwise_bond_indicator,
        pairwise_ring_membership=pairwise_ring_membership,
    )

    def _concat(self, data_dict: Dict[str, Iterable]):
        """Merge all the computed features into a single entry

        Args:
            data_dict: mapping of feature names to tensor/arrays
        Returns:
            concatenated_dict: a dict with a single key where all array have been concatenated
        """
        # To reproduce DGLDefault, we need to keep the order of dict insertion
        pairwise_order = list(self.pairwise_atom_funcs.keys())
        return concat_dict(data_dict, new_name=self.name, order=pairwise_order)

`DEFAULT_FEATURIZER = {}` `class-attribute` ¶

`DEFAULT_PAIRWISE_FEATURIZER = {'pairwise_dist_indicator': pairwise_dist_indicator, 'pairwise_bond_indicator': pairwise_bond_indicator, 'pairwise_ring_membership': pairwise_ring_membership}` `class-attribute` ¶

`EdgeMatCalculator` ¶

Bases: BondCalculator

Generate edge featurizer matrix

Source code in molfeat/calc/bond.py

class EdgeMatCalculator(BondCalculator):
    """Generate edge featurizer matrix"""

    DEFAULT_PAIRWISE_FEATURIZER = {
        "pairwise_2D_dist": pairwise_2D_dist,
        # "pairwise_3D_dist": pairwise_3D_dist,
        "pairwise_ring_membership": pairwise_ring_membership,
    }

    def __init__(
        self,
        featurizer_funcs: Union[list, dict] = None,
        pairwise_atom_funcs: Union[list, dict, str] = "default",
        name: str = "he",
    ):
        """
        Init function of the edge matrix property calculator

        Args:
            featurizer_funcs: Mapping feature name to the featurization function.
            pairwise_atom_funcs: Mapping feature name to pairwise featurization function.
                Use the keywords "default" for the default values
            name: Name of the key under which the edge matrix is returned
        """
        if pairwise_atom_funcs == "default":
            pairwise_atom_funcs = self.DEFAULT_PAIRWISE_FEATURIZER
        if not isinstance(pairwise_atom_funcs, dict):
            pairwise_atom_funcs = {
                getattr(fn, "__name__", repr(fn)): fn for fn in pairwise_atom_funcs
            }
        # must be set before calling the parent constructor, which calls `feat_size`
        self.pairwise_atom_funcs = pairwise_atom_funcs
        super().__init__(featurizer_funcs=featurizer_funcs, concat=True, name=name)
        # add conf data to toy mol
        self._toy_mol = dm.conformers.generate(self._toy_mol, n_confs=1, minimize_energy=False)
        for k in self.pairwise_atom_funcs.keys():
            self.feat_size(feat_name=k)

    def to_state_dict(self):
        """Convert the edge matrix calculator to a state dict
        Due to some constraints and cross-version compatibility,  the featurizer functions
        need to be pickled and not just listed
        """
        state_dict = super().to_state_dict()
        # repeat for the pairwise one
        pairwise_atom_fn_pickled = {}
        for fname, ffunc in self.pairwise_atom_funcs.items():
            pairwise_atom_fn_pickled[fname] = fn_to_hex(ffunc)
        state_dict["args"]["pairwise_atom_funcs"] = pairwise_atom_fn_pickled
        return state_dict

    @classmethod
    def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
        """Create an instance of an edge matrix calculator from a state dict

        Args:
            state_dict: state dictionary to use to create the calculator
            override_args: optional dictionary of arguments to override the ones in the state dict
                at construction of the new object
        """
        # EN: at this moment, version compatibility is not enforced
        cls_name = state_dict.get("name", cls.__name__)
        module_name = state_dict.get("module", cls.__module__)
        module = importlib.import_module(module_name)
        klass = getattr(module, cls_name)

        kwargs = state_dict["args"].copy()
        # now we need to unpickle the featurizer functions
        featurizer_fn_pickled = kwargs.pop("featurizer_funcs", None)
        if featurizer_fn_pickled is not None:
            featurizer_fn_loaded = {}
            for k, v in featurizer_fn_pickled.items():
                featurizer_fn_loaded[k] = hex_to_fn(v)
            kwargs["featurizer_funcs"] = featurizer_fn_loaded

        pairwise_atom_fn_pickled = kwargs.pop("pairwise_atom_funcs", None)
        if pairwise_atom_fn_pickled is not None:
            pairwise_atom_fn_loaded = {}
            for k, v in pairwise_atom_fn_pickled.items():
                pairwise_atom_fn_loaded[k] = hex_to_fn(v)
            kwargs["pairwise_atom_funcs"] = pairwise_atom_fn_loaded
        kwargs.update(**(override_args or {}))
        return klass(**kwargs)

    def feat_size(self, feat_name: Optional[str] = None):
        """Get the feature size for ``feat_name``.

        Args:
            feat_name: Feature for query. It can be the name of a bond feature or
                the name of a pairwise feature.

        Returns:
            int: Feature size for the feature with name ``feat_name``. Default to None.

        Raises:
            ValueError: When ``feat_name`` is neither a bond nor a pairwise feature
        """
        if feat_name not in self.featurizer_funcs and feat_name not in self.pairwise_atom_funcs:
            # both kinds of features are valid, so both are listed in the message
            raise ValueError(
                "Expect feat_name to be in {}, got {}".format(
                    list(self.featurizer_funcs.keys()) + list(self.pairwise_atom_funcs.keys()),
                    feat_name,
                )
            )
        if feat_name not in self._feat_sizes:
            if feat_name in self.featurizer_funcs:
                bond = self._toy_mol.GetBondWithIdx(0)
                self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](bond))
            else:
                # the check above guarantees this is a pairwise feature: its size is
                # the last dimension of the output computed on the toy molecule
                self._feat_sizes[feat_name] = self.pairwise_atom_funcs[feat_name](
                    self._toy_mol
                ).shape[-1]
        return self._feat_sizes[feat_name]

    def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None, flat: bool = True):
        """Featurize all bonds in a molecule.

        Args:
            mol: the molecule of interest
            dtype: requested data type
            flat: whether to return a collapsed N^2, M or a N, N, M matrix

        Returns:
            dict: a dict with a single key, ``self.name``, holding the edge matrix
        """

        mol = dm.to_mol(mol)
        num_bonds = mol.GetNumBonds()
        num_atoms = mol.GetNumAtoms()
        feat_size = len(self)
        edge_matrix = None

        if self.pairwise_atom_funcs is not None:
            # only keep the width of the per-bond features
            feat_size -= sum(self._feat_sizes[x] for x in self.pairwise_atom_funcs.keys())
        if self.featurizer_funcs is not None and len(self.featurizer_funcs) > 0:
            edge_matrix = np.zeros((num_atoms, num_atoms, feat_size))
            # Compute features for each bond
            for i in range(num_bonds):
                bond = mol.GetBondWithIdx(i)
                a_idx_1 = bond.GetBeginAtomIdx()
                a_idx_2 = bond.GetEndAtomIdx()
                bond_features = defaultdict(list)
                for feat_name, feat_func in self.featurizer_funcs.items():
                    feat = feat_func(bond)
                    bond_features[feat_name].extend([feat])
                bond_features = self._concat(bond_features)[self.name]
                # the same features are written for both directions of the bond
                edge_matrix[a_idx_1, a_idx_2] = bond_features
                edge_matrix[a_idx_2, a_idx_1] = bond_features

            edge_matrix = edge_matrix.reshape(-1, feat_size)
        if self.pairwise_atom_funcs is not None:
            pwise_features = dict()
            for pname, pfunc in self.pairwise_atom_funcs.items():
                pwise_features[pname] = pfunc(mol)
            pwise_features = self._concat(pwise_features)[self.name]
            if edge_matrix is not None:
                edge_matrix = np.concatenate([edge_matrix, pwise_features], axis=-1)
            else:
                edge_matrix = pwise_features
        if not flat:
            edge_matrix = edge_matrix.reshape(num_atoms, num_atoms, -1)
        if dtype is not None:
            edge_matrix = datatype.cast(edge_matrix, dtype=dtype)
        return {self.name: edge_matrix}

`DEFAULT_PAIRWISE_FEATURIZER = {'pairwise_2D_dist': pairwise_2D_dist, 'pairwise_ring_membership': pairwise_ring_membership}` `class-attribute` ¶

`pairwise_atom_funcs = pairwise_atom_funcs` `instance-attribute` ¶

`__call__(mol, dtype=None, flat=True)` ¶

Featurize all bonds in a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Union[rdchem.Mol, str]`	the molecule of interest	required
`dtype`	`Callable`	requested data type	`None`
`flat`	`bool`	whether to return a collapsed N^2, M or a N, N, M matrix	`True`

Returns:

Name	Type	Description
`dict`		For each function in self.featurizer_funcs with the key `k`, store the computed feature under the key `k`.

Source code in molfeat/calc/bond.py

def __call__(self, mol: Union[rdchem.Mol, str], dtype: Callable = None, flat: bool = True):
    """Compute the atom-by-atom edge feature matrix of a molecule.

    Args:
        mol: the molecule of interest
        dtype: requested data type; the matrix is only cast when this is not None
        flat: whether to return a collapsed (N^2, M) matrix or an (N, N, M) one,
            where N is the number of atoms

    Returns:
        dict: a single-entry dictionary mapping ``self.name`` to the edge matrix.
    """
    mol = dm.to_mol(mol)
    n_atoms = mol.GetNumAtoms()
    n_bonds = mol.GetNumBonds()
    has_pairwise = self.pairwise_atom_funcs is not None

    # len(self) counts every feature; the bond-only width excludes the pairwise ones
    bond_dim = len(self)
    if has_pairwise:
        bond_dim -= sum(self._feat_sizes[key] for key in self.pairwise_atom_funcs.keys())

    edge_matrix = None
    if self.featurizer_funcs is not None and len(self.featurizer_funcs) > 0:
        dense = np.zeros((n_atoms, n_atoms, bond_dim))
        for bond_idx in range(n_bonds):
            bond = mol.GetBondWithIdx(bond_idx)
            begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            per_feature = defaultdict(list)
            for key, func in self.featurizer_funcs.items():
                per_feature[key].append(func(bond))
            bond_vector = self._concat(per_feature)[self.name]
            # write both orientations so the matrix is symmetric in the atom indices
            dense[begin, end] = bond_vector
            dense[end, begin] = bond_vector
        edge_matrix = dense.reshape(-1, bond_dim)

    if has_pairwise:
        pairwise = {key: func(mol) for key, func in self.pairwise_atom_funcs.items()}
        pairwise = self._concat(pairwise)[self.name]
        if edge_matrix is None:
            edge_matrix = pairwise
        else:
            # bond features first, pairwise features appended along the last axis
            edge_matrix = np.concatenate([edge_matrix, pairwise], axis=-1)

    if not flat:
        edge_matrix = edge_matrix.reshape(n_atoms, n_atoms, -1)
    if dtype is not None:
        edge_matrix = datatype.cast(edge_matrix, dtype=dtype)
    return {self.name: edge_matrix}

`init(featurizer_funcs=None, pairwise_atom_funcs='default', name='he')` ¶

Init function of the edge matrix property calculator

Parameters:

Name	Type	Description	Default
`featurizer_funcs`	`Union[list, dict]`	Mapping feature name to the featurization function.	`None`
`pairwise_atom_funcs`	`Union[list, dict, str]`	Mapping feature name to pairwise featurization function. Use the keywords "default" for the default values	`'default'`

Source code in molfeat/calc/bond.py

def __init__(
    self,
    featurizer_funcs: Union[list, dict] = None,
    pairwise_atom_funcs: Union[list, dict, str] = "default",
    name: str = "he",
):
    """
    Set up the edge matrix property calculator

    Args:
        featurizer_funcs: Mapping feature name to the featurization function.
        pairwise_atom_funcs: Mapping feature name to pairwise featurization function.
            Pass the keyword "default" to use ``DEFAULT_PAIRWISE_FEATURIZER``.
            A non-dict iterable of callables is keyed by each callable's ``__name__``
            (falling back to its ``repr``).
        name: key under which the computed matrix is returned
    """
    if pairwise_atom_funcs == "default":
        pairwise_atom_funcs = self.DEFAULT_PAIRWISE_FEATURIZER
    if not isinstance(pairwise_atom_funcs, dict):
        pairwise_atom_funcs = {
            getattr(func, "__name__", repr(func)): func for func in pairwise_atom_funcs
        }
    # must be set before the parent constructor runs
    self.pairwise_atom_funcs = pairwise_atom_funcs
    super().__init__(featurizer_funcs=featurizer_funcs, concat=True, name=name)
    # the pairwise functions are probed on the toy molecule, so give it a conformer
    self._toy_mol = dm.conformers.generate(self._toy_mol, n_confs=1, minimize_energy=False)
    # pre-compute and cache the size of every pairwise feature
    for pairwise_name in self.pairwise_atom_funcs:
        self.feat_size(feat_name=pairwise_name)

`feat_size(feat_name=None)` ¶

Get the feature size for feat_name.

Parameters:

Name	Type	Description	Default
`feat_name`	`Optional[str]`	Feature for query.	`None`

Returns:

Name	Type	Description
`int`		Feature size for the feature with name `feat_name`. Default to None.

Source code in molfeat/calc/bond.py

def feat_size(self, feat_name: Optional[str] = None):
    """Get the feature size for ``feat_name``.

    The size is computed once on the toy molecule and then cached in
    ``self._feat_sizes``.

    Args:
        feat_name: Feature for query. Must be a key of either
            ``self.featurizer_funcs`` or ``self.pairwise_atom_funcs``.

    Returns:
        int: Feature size for the feature with name ``feat_name``.

    Raises:
        ValueError: When ``feat_name`` is neither a bond feature nor a pairwise feature.
    """
    # __call__ tolerates a missing pairwise mapping, so do the same here
    pairwise_funcs = self.pairwise_atom_funcs or {}
    is_bond_feat = feat_name in self.featurizer_funcs
    if not is_bond_feat and feat_name not in pairwise_funcs:
        # report every accepted name, not only the bond-level ones
        known_names = list(self.featurizer_funcs.keys()) + list(pairwise_funcs.keys())
        raise ValueError(
            "Expect feat_name to be in {}, got {}".format(known_names, feat_name)
        )
    if feat_name not in self._feat_sizes:
        if is_bond_feat:
            # bond features return a sequence: its length is the feature size
            bond = self._toy_mol.GetBondWithIdx(0)
            self._feat_sizes[feat_name] = len(self.featurizer_funcs[feat_name](bond))
        else:
            # pairwise features return an array: the last axis is the feature size
            self._feat_sizes[feat_name] = pairwise_funcs[feat_name](self._toy_mol).shape[-1]
    return self._feat_sizes[feat_name]

`from_state_dict(state_dict, override_args=None)` `classmethod` ¶

Create an instance of an atom calculator from a state dict

Parameters:

Name	Type	Description	Default
`state_dict`		state dictionary to use to create the atom calculator	required
`override_args`	`Optional[dict]`	optional dictionary of arguments to override the ones in the state dict at construction of the new object	`None`

Source code in molfeat/calc/bond.py

@classmethod
def from_state_dict(cls, state_dict, override_args: Optional[dict] = None):
    """Rebuild a calculator from a state dict

    The class to instantiate is looked up from the ``"module"`` and ``"name"``
    entries of the state dict, falling back to ``cls`` itself when absent.

    Args:
        state_dict: state dictionary holding the constructor arguments under ``"args"``
        override_args: optional dictionary of arguments to override the ones in the state dict
            at construction of the new object
    """
    # EN: at this moment, version compatibility is not enforced
    target_module = importlib.import_module(state_dict.get("module", cls.__module__))
    klass = getattr(target_module, state_dict.get("name", cls.__name__))

    # work on a copy so the caller's state dict is left untouched
    init_kwargs = state_dict["args"].copy()
    # both function mappings are stored serialized and must be decoded back to callables
    # NOTE(review): hex_to_fn presumably unpickles arbitrary objects; only load
    # state dicts from trusted sources (confirm against its implementation)
    for arg_name in ("featurizer_funcs", "pairwise_atom_funcs"):
        serialized = init_kwargs.pop(arg_name, None)
        if serialized is not None:
            init_kwargs[arg_name] = {key: hex_to_fn(blob) for key, blob in serialized.items()}

    init_kwargs.update(**(override_args or {}))
    return klass(**init_kwargs)

`to_state_dict()` ¶

Convert the calculator to a state dict. Due to some constraints and cross-version compatibility, the featurizer functions need to be pickled and not just listed.

Source code in molfeat/calc/bond.py

def to_state_dict(self):
    """Serialize the calculator into a state dict

    Starts from the parent's state dict and adds the pairwise atom functions,
    each encoded with ``fn_to_hex``, under ``state["args"]["pairwise_atom_funcs"]``.
    """
    state = super().to_state_dict()
    # the pairwise functions are encoded rather than stored by name
    state["args"]["pairwise_atom_funcs"] = {
        key: fn_to_hex(func) for key, func in self.pairwise_atom_funcs.items()
    }
    return state

Calculators¶

get_calculator(name, **params) ¶

Fingerprints¶

FPCalculator ¶

columns property ¶

counting = counting or '-count' in self.method instance-attribute ¶

input_length = length instance-attribute ¶

method = method.lower() instance-attribute ¶

params = default_params instance-attribute ¶

__call__(mol, raw=False) ¶

__getstate__() ¶

__init__(method, length=None, counting=False, **kwargs) ¶

__len__() ¶

__setstate__(state) ¶

to_state_dict() ¶

Descriptors¶

MordredDescriptors ¶

columns property ¶

do_not_standardize = do_not_standardize instance-attribute ¶

ignore_3D = ignore_3D instance-attribute ¶

replace_nan = replace_nan instance-attribute ¶

__call__(mol, conformer_id=-1) ¶

__getstate__() ¶

__init__(ignore_3D=True, replace_nan=False, do_not_standardize=False, **kwargs) ¶

__len__() ¶

__setstate__(state) ¶

RDKitDescriptors2D ¶

DESCRIPTORS_FN = {name: fn for (name, fn) in Descriptors.descList} class-attribute ¶

augment = augment instance-attribute ¶

avg_ipc = avg_ipc instance-attribute ¶

columns property ¶

descrs = descrs instance-attribute ¶

do_not_standardize = do_not_standardize instance-attribute ¶

replace_nan = replace_nan instance-attribute ¶

__call__(mol) ¶

__getstate__() ¶

__init__(replace_nan=False, augment=True, descrs=None, avg_ipc=True, do_not_standardize=False, **kwargs) ¶

__len__() ¶

RDKitDescriptors3D ¶

columns property ¶

ignore_descrs = ignore_descrs or [] instance-attribute ¶

replace_nan = replace_nan instance-attribute ¶

__call__(mol, conformer_id=-1) ¶

__getstate__() ¶

__init__(replace_nan=False, ignore_descrs=['CalcGETAWAY'], **kwargs) ¶

__len__() ¶

CATS¶

CATS ¶

DESCRIPTORS = ['DD', 'AD', 'DP', 'DN', 'DL', 'DR', 'AA', 'AP', 'AN', 'AL', 'AR', 'PP', 'NP', 'LP', 'PR', 'NN', 'LN', 'NR', 'LL', 'LR', 'RR'] class-attribute ¶

MAX_DIST_DEFAULT_2D = 8 class-attribute ¶

MAX_DIST_DEFAULT_3D = 5 class-attribute ¶

bins = list(sorted(bins)) instance-attribute ¶

columns property ¶

max_dist = max_dist instance-attribute ¶

scale = scale instance-attribute ¶

use_3d_distances = use_3d_distances instance-attribute ¶

__call__(mol, conformer_id=-1) ¶

__getstate__() ¶

__init__(max_dist=None, bins=None, scale='raw', use_3d_distances=False, **kwargs) ¶

__len__() ¶

__setstate__(state) ¶

Pharmacophore¶

Pharmacophore2D ¶

bins = bins instance-attribute ¶

columns property ¶

factory = factory instance-attribute ¶

feature_factory property ¶

includeBondOrder = includeBondOrder instance-attribute ¶

length = length instance-attribute ¶

maxPointCount = maxPointCount instance-attribute ¶

minPointCount = minPointCount instance-attribute ¶

shortestPathsOnly = shortestPathsOnly instance-attribute ¶

skipFeats = skipFeats instance-attribute ¶

trianglePruneBins = trianglePruneBins instance-attribute ¶

useCounts = useCounts instance-attribute ¶

__call__(mol, raw=False) ¶

__getstate__() ¶

__init__(factory='pmapper', length=2048, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, **kwargs) ¶

__len__() ¶

__setstate__(state) ¶

`Calculators`¶

`get_calculator(name, **params)` ¶

`Fingerprints`¶

`FPCalculator` ¶

`columns` `property` ¶

`counting = counting or '-count' in self.method` `instance-attribute` ¶

`input_length = length` `instance-attribute` ¶

`method = method.lower()` `instance-attribute` ¶

`params = default_params` `instance-attribute` ¶

`call(mol, raw=False)` ¶

`getstate()` ¶

`init(method, length=None, counting=False, **kwargs)` ¶

`len()` ¶

`setstate(state)` ¶

`to_state_dict()` ¶

`Descriptors`¶

`MordredDescriptors` ¶

`columns` `property` ¶

`do_not_standardize = do_not_standardize` `instance-attribute` ¶

`ignore_3D = ignore_3D` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol, conformer_id=-1)` ¶

`getstate()` ¶

`init(ignore_3D=True, replace_nan=False, do_not_standardize=False, **kwargs)` ¶

`len()` ¶

`setstate(state)` ¶

`RDKitDescriptors2D` ¶

`DESCRIPTORS_FN = {name: fn for (name, fn) in Descriptors.descList}` `class-attribute` ¶

`augment = augment` `instance-attribute` ¶

`avg_ipc = avg_ipc` `instance-attribute` ¶

`columns` `property` ¶

`descrs = descrs` `instance-attribute` ¶

`do_not_standardize = do_not_standardize` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol)` ¶

`getstate()` ¶

`init(replace_nan=False, augment=True, descrs=None, avg_ipc=True, do_not_standardize=False, **kwargs)` ¶

`len()` ¶

`RDKitDescriptors3D` ¶

`columns` `property` ¶

`ignore_descrs = ignore_descrs or []` `instance-attribute` ¶

`replace_nan = replace_nan` `instance-attribute` ¶

`call(mol, conformer_id=-1)` ¶

`getstate()` ¶

`init(replace_nan=False, ignore_descrs=['CalcGETAWAY'], **kwargs)` ¶

`len()` ¶

`CATS`¶

`CATS` ¶

`DESCRIPTORS = ['DD', 'AD', 'DP', 'DN', 'DL', 'DR', 'AA', 'AP', 'AN', 'AL', 'AR', 'PP', 'NP', 'LP', 'PR', 'NN', 'LN', 'NR', 'LL', 'LR', 'RR']` `class-attribute` ¶

`MAX_DIST_DEFAULT_2D = 8` `class-attribute` ¶

`MAX_DIST_DEFAULT_3D = 5` `class-attribute` ¶

`bins = list(sorted(bins))` `instance-attribute` ¶

`columns` `property` ¶

`max_dist = max_dist` `instance-attribute` ¶

`scale = scale` `instance-attribute` ¶

`use_3d_distances = use_3d_distances` `instance-attribute` ¶

`call(mol, conformer_id=-1)` ¶

`getstate()` ¶

`init(max_dist=None, bins=None, scale='raw', use_3d_distances=False, **kwargs)` ¶

`len()` ¶

`setstate(state)` ¶

`Pharmacophore`¶

`Pharmacophore2D` ¶

`bins = bins` `instance-attribute` ¶

`columns` `property` ¶

`factory = factory` `instance-attribute` ¶

`feature_factory` `property` ¶

`includeBondOrder = includeBondOrder` `instance-attribute` ¶

`length = length` `instance-attribute` ¶

`maxPointCount = maxPointCount` `instance-attribute` ¶

`minPointCount = minPointCount` `instance-attribute` ¶

`shortestPathsOnly = shortestPathsOnly` `instance-attribute` ¶

`skipFeats = skipFeats` `instance-attribute` ¶

`trianglePruneBins = trianglePruneBins` `instance-attribute` ¶

`useCounts = useCounts` `instance-attribute` ¶

`call(mol, raw=False)` ¶

`getstate()` ¶

`init(factory='pmapper', length=2048, useCounts=None, minPointCount=None, maxPointCount=None, shortestPathsOnly=None, includeBondOrder=None, skipFeats=None, trianglePruneBins=None, bins=None, **kwargs)` ¶

`len()` ¶

`setstate(state)` ¶