Skip to content

molfeat.trans.fp

FPVecFilteredTransformer

Bases: FPVecTransformer

Fingerprint molecule transformer with column filters that are applied to the featurized vector when fit is called

Source code in molfeat/trans/fp.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
class FPVecFilteredTransformer(FPVecTransformer):
    r"""
    Fingerprint transformer that learns, when `fit` is called, which columns of the
    featurized vectors should be kept.
    """

    def __init__(
        self,
        kind: str = "ecfp:4",
        length: int = 2000,
        occ_threshold: float = 0,
        del_invariant: bool = False,
        n_jobs: int = 1,
        verbose: bool = False,
        dtype: Callable = np.float32,
        **params,
    ):
        """Molecule-to-vector featurization followed by column filtering

        Args:
            kind (str, optional): Name of the fingerprint (one of self.AVAILABLE_FPS). Defaults to "ecfp:4".
            length (int, optional): Length of the fingerprint. Defaults to 2000.
            occ_threshold (float, optional): Occupancy threshold used by `fit`. A column is kept only if the
                fraction of molecules with a non-zero value in it is strictly greater than this threshold.
                A value above 1 is divided by the number of featurized molecules first. Defaults to 0.
            del_invariant (bool, optional): Whether `fit` should also drop columns whose value is the same
                for every molecule. Defaults to False.
            n_jobs (int, optional): Number of jobs. Defaults to 1.
            verbose (bool, optional): Verbosity level. Defaults to False.
            dtype (Callable, optional): Data type. Defaults to np.float32.
            params (dict, optional): Any additional parameters to the fingerprint function
        """
        super().__init__(
            kind=kind, length=length, n_jobs=n_jobs, verbose=verbose, dtype=dtype, **params
        )
        self.occ_threshold = occ_threshold
        self.del_invariant = del_invariant
        # Record the filter settings next to the other input parameters.
        self._input_params.update(
            {"occ_threshold": occ_threshold, "del_invariant": del_invariant}
        )

    def _update_params(self):
        """Rebuild the featurizer from the stored input parameters.

        Every key that is not an argument of the fingerprint function itself
        (transformer options and the two filter settings) is removed before the
        remaining parameters are forwarded to `_prepare_featurizer`.
        """
        not_for_featurizer = (
            "featurizer",
            "length",
            "kind",
            "verbose",
            "dtype",
            "n_jobs",
            "occ_threshold",
            "del_invariant",
        )
        params = copy.deepcopy(self._input_params)
        for key in not_for_featurizer:
            params.pop(key, None)
        self.featurizer = self._prepare_featurizer(self.kind, self.length, **params)

    def __repr__(self):
        """Return a short description listing kind, length, filter settings and dtype."""
        shown = (self.kind, self.length, self.occ_threshold, self.del_invariant, self.dtype)
        template = "{} (kind={}, length={}, occ_threshold={}, del_invariant={}, dtype={})"
        return template.format(
            self.__class__.__name__,
            *(_parse_to_evaluable_str(value) for value in shown),
        )

    def fit(self, X: List[Union[dm.Mol, str]], y: Optional[list] = None, **fit_params):
        """Learn which feature columns to keep from a set of molecules.

        The molecules are featurized with `transform` (errors ignored) and null
        results are discarded. If at least one result remains, `cols_to_keep` is
        set to the indices of the columns that satisfy every active criterion:

        - the column contains no NaN;
        - the fraction of rows with a non-zero entry is strictly greater than
          `occ_threshold` (a threshold above 1 is first divided by the number of rows);
        - when `del_invariant` is set, the column is not constant across rows.

        Args:
            X: input list of molecules
            y (list, optional): Optional list of molecular properties, not used by this method. Defaults to None.

        Returns:
            self: the transformer instance, flagged as fitted
        """
        computed = self.transform(X, ignore_errors=True)
        usable = [row for row in computed if not datatype.is_null(row)]

        if usable:
            matrix = datatype.to_numpy(usable)
            n_rows = matrix.shape[0]

            # A threshold above 1 is converted to a fraction of the rows.
            min_occupancy = self.occ_threshold
            if min_occupancy > 1:
                min_occupancy = min_occupancy / n_rows

            # Each mask is True for the columns that pass one criterion.
            has_nan = np.any(np.isnan(matrix), axis=0)
            occupancy = np.count_nonzero(matrix, axis=0) / n_rows
            keep_masks = [~has_nan, occupancy > min_occupancy]
            if self.del_invariant:
                is_constant = np.all(matrix == matrix[0, :], axis=0)
                keep_masks.append(~is_constant)

            # A column survives only if it passes all criteria.
            self.cols_to_keep = np.logical_and.reduce(keep_masks).nonzero()[0]

        self._fitted = True
        return self

__init__(kind='ecfp:4', length=2000, occ_threshold=0, del_invariant=False, n_jobs=1, verbose=False, dtype=np.float32, **params)

Molecular to vector featurization with filtering applied

Parameters:

Name Type Description Default
kind str

Name of the fingerprint (one of the supported fingerprints: see self.AVAILABLE_FPS). Defaults to "ecfp:4".

'ecfp:4'
length int

Length of the fingerprint. Defaults to 2000.

2000
occ_threshold float

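Occupancy threshold: a column is kept only if the proportion of molecules in which it is non-zero is strictly greater than this value. A value above 1 is interpreted as an absolute count and divided by the number of featurized molecules.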

0
del_invariant bool

Whether to delete columns that are invariant.

False
n_jobs int

Number of jobs. Defaults to 1.

1
verbose bool

Verbosity level. Defaults to False.

False
dtype Callable

Data type. Defaults to np.float32.

float32
params dict

Any additional parameters to the fingerprint function

{}
Source code in molfeat/trans/fp.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def __init__(
    self,
    kind: str = "ecfp:4",
    length: int = 2000,
    occ_threshold: float = 0,
    del_invariant: bool = False,
    n_jobs: int = 1,
    verbose: bool = False,
    dtype: Callable = np.float32,
    **params,
):
    """Molecule-to-vector featurization followed by column filtering

    Args:
        kind (str, optional): Name of the fingerprint (one of self.AVAILABLE_FPS). Defaults to "ecfp:4".
        length (int, optional): Length of the fingerprint. Defaults to 2000.
        occ_threshold (float, optional): Occupancy threshold stored on the instance for column filtering. Defaults to 0.
        del_invariant (bool, optional): Whether invariant columns should be deleted. Defaults to False.
        n_jobs (int, optional): Number of jobs. Defaults to 1.
        verbose (bool, optional): Verbosity level. Defaults to False.
        dtype (Callable, optional): Data type. Defaults to np.float32.
        params (dict, optional): Any additional parameters to the fingerprint function
    """
    super().__init__(
        kind=kind, length=length, n_jobs=n_jobs, verbose=verbose, dtype=dtype, **params
    )
    self.occ_threshold = occ_threshold
    self.del_invariant = del_invariant
    # Record the filter settings next to the other input parameters.
    self._input_params.update(
        {"occ_threshold": occ_threshold, "del_invariant": del_invariant}
    )

fit(X, y=None, **fit_params)

Fit the current transformer on given dataset.

The goal of fitting is, for example, to identify columns containing NaN values that need to be removed from the dataset.

Parameters:

Name Type Description Default
X List[Union[Mol, str]]

input list of molecules

required
y list

Optional list of molecular properties. Defaults to None.

None

Returns:

Name Type Description
self

MolTransformer instance after fitting

Source code in molfeat/trans/fp.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def fit(self, X: List[Union[dm.Mol, str]], y: Optional[list] = None, **fit_params):
    """Learn which feature columns to keep from a set of molecules.

    The molecules are featurized with `transform` (errors ignored) and null
    results are discarded. If at least one result remains, `cols_to_keep` is
    set to the indices of the columns that satisfy every active criterion:

    - the column contains no NaN;
    - the fraction of rows with a non-zero entry is strictly greater than
      `occ_threshold` (a threshold above 1 is first divided by the number of rows);
    - when `del_invariant` is set, the column is not constant across rows.

    Args:
        X: input list of molecules
        y (list, optional): Optional list of molecular properties, not used by this method. Defaults to None.

    Returns:
        self: the transformer instance, flagged as fitted
    """
    computed = self.transform(X, ignore_errors=True)
    usable = [row for row in computed if not datatype.is_null(row)]

    if usable:
        matrix = datatype.to_numpy(usable)
        n_rows = matrix.shape[0]

        # A threshold above 1 is converted to a fraction of the rows.
        min_occupancy = self.occ_threshold
        if min_occupancy > 1:
            min_occupancy = min_occupancy / n_rows

        # Each mask is True for the columns that pass one criterion.
        has_nan = np.any(np.isnan(matrix), axis=0)
        occupancy = np.count_nonzero(matrix, axis=0) / n_rows
        keep_masks = [~has_nan, occupancy > min_occupancy]
        if self.del_invariant:
            is_constant = np.all(matrix == matrix[0, :], axis=0)
            keep_masks.append(~is_constant)

        # A column survives only if it passes all criteria.
        self.cols_to_keep = np.logical_and.reduce(keep_masks).nonzero()[0]

    self._fitted = True
    return self

FPVecTransformer

Bases: MoleculeTransformer

Molecular fingerprinter that computes various fingerprints and descriptors regularly used in QSAR modeling.

Note

For fingerprints with a radius parameter, you can provide the radius using the notation "fp:radius". For example "Morgan Circular 2" can be written as "morgan:2". Under the hood, morgan and ecfp fingerprints are equated with the proper radius/diameter adjustment.

For counting fingerprints, you just need to add the '-count' suffix to the name of the fingerprint. For example: "morgan-count:2"

Source code in molfeat/trans/fp.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
class FPVecTransformer(MoleculeTransformer):
    r"""
    Molecular fingerprinter that computes various fingerprints and descriptors regularly used in QSAR modeling.

    !!! note
        For fingerprints with a radius parameter, you can provide the radius using the notation "fp:radius".
        For example "Morgan Circular 2" can be written as "morgan:2". Under the hood, morgan and ecfp fingerprints
        are equated with the proper radius/diameter adjustment.

        For counting fingerprints, you just need to add the '-count' suffix to the name of the fingerprint. For example:
        "morgan-count:2"
    """

    # Names accepted by `_prepare_featurizer` once the optional ":<radius>" suffix
    # and the morgan alias have been resolved.
    AVAILABLE_FPS = list(FP_FUNCS.keys()) + [
        "desc3D",
        "desc2D",
        "mordred",
        "cats2D",
        "cats3D",
        "pharm2D",
        "pharm3D",
        "scaffoldkeys",
        "skeys",
        "electroshape",
        "usr",
        "usrcat",
    ]

    def __init__(
        self,
        kind: str = "ecfp:4",
        length: int = 2000,
        n_jobs: int = 1,
        verbose: bool = False,
        dtype: Callable = np.float32,
        parallel_kwargs: Optional[dict] = None,
        **params,
    ):
        """Molecular to vector fingerprinter

        Args:
            kind (str, optional): Name of the fingerprint (one of self.AVAILABLE_FPS), optionally followed
                by ":<radius>". Defaults to "ecfp:4".
            length (int, optional): Length of the fingerprint. Defaults to 2000.
            n_jobs (int, optional): Number of jobs. Defaults to 1.
            verbose (bool, optional): Verbosity level. Defaults to False.
            dtype (Callable, optional): Data type. Defaults to np.float32.
            parallel_kwargs (dict, optional): Optional arguments to pass to dm.parallelized when required. Defaults to None.
            params (dict, optional): Any additional parameters to the fingerprint function

        Raises:
            ValueError: if `kind` does not resolve to a name in `AVAILABLE_FPS`.
        """
        # NOTE(review): presumably records the constructor arguments; it is kept as the
        # very first statement because its behavior is not visible here -- confirm.
        self._save_input_args()

        # remove any featurizer that was passed as argument
        params.pop("featurizer", None)
        self._feat_params = params
        featurizer = self._prepare_featurizer(kind, length, **params)
        super().__init__(
            featurizer=featurizer,
            n_jobs=n_jobs,
            verbose=verbose,
            dtype=dtype,
            parallel_kwargs=parallel_kwargs,
            **params,
        )
        self.kind = kind
        self.length = length
        self._length = None
        # Cache the length of featurizers that report their own (fixed) length.
        # Keeping it in the protected `_length` attribute avoids the `is`
        # comparison sklearn performs in `clone`.
        # Note that the requested `length` is likely ignored by featurizers
        # that do not support a variable length.
        if hasattr(self.featurizer, "__len__"):
            self._length = len(featurizer)
        self._input_params.update(kind=kind, length=length)
        # Kinds listed in _UNSERIALIZABLE_FPS are run with a thread scheduler.
        if self.kind.lower() in _UNSERIALIZABLE_FPS:
            self.parallel_kwargs.update(scheduler="threads")

    def __len__(self):
        """Compute featurizer length

        The length cached at construction time is returned only while no column
        filter (`cols_to_keep`) is set; otherwise the parent implementation is used.
        """
        if getattr(self, "cols_to_keep", None) is None and self._length is not None:
            return self._length
        return super().__len__()

    def _get_param_names(self):
        """Get parameter names for the estimator (every stored input parameter except "featurizer")"""
        return [name for name in self._input_params.keys() if name != "featurizer"]

    @classmethod
    def _prepare_featurizer(cls, kind: str, length: int, **params):
        """Prepare featurizer from its name and parameters

        A trailing ":<digits>" in `kind` is parsed as a radius and forwarded as the
        `radius` parameter; for ecfp/fcfp names the value is treated as a diameter
        and halved (with a minimum of 1). Morgan names ("morgan", "morgan_circular",
        "morgan-circular") are mapped to their "ecfp" equivalent.

        Args:
            kind: Name of the featurizer
            length: Length of the featurizer
            params: Any additional parameters to the fingerprint function

        Returns:
            calculator (Callable): fingerprint calculator

        Raises:
            ValueError: if the resolved name is not in `AVAILABLE_FPS`.
        """
        match = re.search(r":(\d+)$", kind)
        if match is not None:
            radius = int(match.group(1))
            # Cut only the trailing ":<radius>" suffix. Replacing the digits
            # everywhere would also mangle names that contain them (e.g. "map4:4").
            kind = kind[: match.start()].strip(":").lower()
            if any(x in kind for x in ["ecfp", "fcfp"]):
                radius = max(radius // 2, 1)
            params["radius"] = radius
        if "morgan" in kind:
            # str.replace returns a new string, so the result must be assigned back
            # for the alias to take effect.
            kind = kind.replace("_circular", "").replace("-circular", "").replace("morgan", "ecfp")
        if kind not in cls.AVAILABLE_FPS:
            raise ValueError(f"{kind} is not a valid featurizer")
        params["length"] = length

        return get_calculator(kind, **params)

    def _update_params(self):
        """Mark the transformer as not fitted and rebuild the featurizer from the stored input parameters."""
        params = copy.deepcopy(self._input_params)
        # These keys configure the transformer, not the fingerprint function.
        for key in ("featurizer", "length", "kind", "verbose", "dtype", "n_jobs"):
            params.pop(key, None)
        self._fitted = False
        self.featurizer = self._prepare_featurizer(self.kind, self.length, **params)

    def __repr__(self):
        return "{}(kind={}, length={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self.kind),
            _parse_to_evaluable_str(self.length),
            _parse_to_evaluable_str(self.dtype),
        )

    def __str__(self):
        # The output for the print function
        return self.__repr__()

    def __eq__(self, other):
        """Two transformers are equal when they have the same type and the same non-callable parameters."""
        if type(self) is not type(other):
            return False
        params = self.get_params()
        # NOTE(review): get_params is defined on the parent class; it is assumed to
        # return a mapping (as in scikit-learn). Iterating a mapping directly yields
        # keys only, so go through .items(); an iterable of pairs is still accepted.
        items = params.items() if hasattr(params, "items") else params
        return all(getattr(other, k) == v for k, v in items if not callable(v))

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(repr(self))

__init__(kind='ecfp:4', length=2000, n_jobs=1, verbose=False, dtype=np.float32, parallel_kwargs=None, **params)

Molecular to vector fingerprinter

Parameters:

Name Type Description Default
kind str

Name of the fingerprint (one of the supported fingerprints: see self.AVAILABLE_FPS). Defaults to "ecfp:4".

'ecfp:4'
length int

Length of the fingerprint. Defaults to 2000.

2000
n_jobs int

Number of jobs. Defaults to 1.

1
verbose bool

Verbosity level. Defaults to False.

False
dtype Callable

Data type. Defaults to np.float32.

float32
parallel_kwargs dict

Optional arguments to pass to dm.parallelized when required. Defaults to None.

None
params dict

Any additional parameters to the fingerprint function

{}
Source code in molfeat/trans/fp.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def __init__(
    self,
    kind: str = "ecfp:4",
    length: int = 2000,
    n_jobs: int = 1,
    verbose: bool = False,
    dtype: Callable = np.float32,
    parallel_kwargs: Optional[dict] = None,
    **params,
):
    """Molecular to vector fingerprinter

    Builds the fingerprint calculator for `kind` and `length`, then initializes the
    parent transformer with it.

    Args:
        kind (str, optional): Name of the fingerprint (one of the supported fingerprints: see self.AVAILABLE_FPS). Defaults to "ecfp:4".
        length (int, optional): Length of the fingerprint. Defaults to 2000.
        n_jobs (int, optional): Number of jobs. Defaults to 1.
        verbose (bool, optional): Verbosity level. Defaults to False.
        dtype (Callable, optional): Data type. Defaults to np.float32.
        parallel_kwargs (dict, optional): Optional arguments to pass to dm.parallelized when required. Defaults to None.
        params (dict, optional): Any additional parameters to the fingerprint function
    """
    # NOTE(review): presumably records the constructor arguments; its behavior is
    # not visible here, so it is kept as the very first statement -- confirm.
    self._save_input_args()

    # remove any featurizer that was passed as argument
    params.pop("featurizer", None)
    self._feat_params = params
    featurizer = self._prepare_featurizer(kind, length, **params)
    super().__init__(
        featurizer=featurizer,
        n_jobs=n_jobs,
        verbose=verbose,
        dtype=dtype,
        parallel_kwargs=parallel_kwargs,
        **params,
    )
    self.kind = kind
    self.length = length
    self._length = None
    # Cache the length of featurizers that report their own (fixed) length.
    # Keeping it in the protected `_length` attribute avoids the `is`
    # comparison sklearn performs in `clone`.
    # Note that the requested `length` is likely ignored by featurizers
    # that do not support a variable length.
    if hasattr(self.featurizer, "__len__"):
        self._length = len(featurizer)
    self._input_params.update(kind=kind, length=length)
    # Kinds listed in _UNSERIALIZABLE_FPS are run with a thread scheduler.
    if self.kind.lower() in _UNSERIALIZABLE_FPS:
        self.parallel_kwargs.update(scheduler="threads")

__len__()

Compute featurizer length

Source code in molfeat/trans/fp.py
 96
 97
 98
 99
100
def __len__(self):
    """Return the featurizer length.

    The length cached in `_length` is used only when no `cols_to_keep` filter is
    set and a cached value exists; in every other case the parent class decides.
    """
    has_filter = getattr(self, "cols_to_keep", None) is not None
    if has_filter or self._length is None:
        return super().__len__()
    return self._length