Skip to content

molfeat.trans.concat

FeatConcat

Bases: list, BaseEstimator

Concatenation container for FPVecTransformer. This class allows merging multiple fingerprints into a single one. It gives the ability to call the following methods - fit - transform - fit_transform on a list of transformers and concatenate the results.

Note

The featurization length of this featurizer is accessible via the length property. len() will return the number of base featurizers.

Source code in molfeat/trans/concat.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
class FeatConcat(list, BaseEstimator):
    r"""
    Concatenation container for `FPVecTransformer`. This class allows
    merging multiple fingerprints into a single one.
    It gives the ability to call the following methods
        - `fit`
        - `transform`
        - `fit_transform`
    on a list of transformers and concatenate the results.

    !!! note
        The featurization length of this featurizer is accessible via the `length` property.
        `len()` will return the number of base featurizers.
    """

    # Separator used when the featurizers are given as a single string, e.g. "kind_a||kind_b"
    _STR_DELIMITER = "||"

    def __init__(
        self,
        iterable: Optional[Union[Iterable, str]] = None,
        dtype: Optional[Callable] = None,
        params: Optional[Dict[str, Any]] = None,
        collate_fn: Optional[Callable] = None,
    ):
        """Featurizer concatenator

        Args:
            iterable: List of featurizers to concatenate. A string is split on
                `_STR_DELIMITER` and each stripped token is used as the `kind` of a
                new `FPVecTransformer`; string items inside an iterable are handled the same way.
            dtype: Datatype of the computed fingerprint
            params: Optional dictionary of parameters for the featurizers when there is a need
                for initializing them. It is looked up by featurizer name (`params[name]`).
            collate_fn: optional function to provide for custom collating.
                By default the collate function will be None, which will use the torch default

        Raises:
            ValueError: if an item is neither a string nor an `FPVecTransformer`
        """
        super().__init__()
        self.params = params or {}
        # Set every attribute before filling the list so the instance is fully
        # initialised by the time `append` runs.
        self.dtype = dtype
        self.collate_fn = collate_fn
        self._length = None
        if isinstance(iterable, str):
            iterable = [x.strip() for x in iterable.split(self._STR_DELIMITER)]
        if iterable is not None:
            for item in iterable:
                if isinstance(item, str):
                    item = FPVecTransformer(kind=item, **self.params.get(item, {}))
                self.append(item)

    def append(self, item):
        r"""Override the ``append`` to accept only ``FPVecTransformer``"""
        self._check_supported(item)
        super().append(item)

    def insert(self, index, item):
        r"""Override the ``insert`` to accept only ``FPVecTransformer``"""
        self._check_supported(item)
        super().insert(index, item)

    def __add__(self, item):
        """Override the `__add__` method

        Args:
            item: a single `FPVecTransformer`, or a list/tuple of them
                (another `FeatConcat` is a list and is therefore accepted).

        Returns:
            A new `FeatConcat` holding the current featurizers followed by the added
            ones, built with the same `dtype`, `params` and `collate_fn`.

        Raises:
            ValueError: if any added element is not an `FPVecTransformer`
        """
        extra = list(item) if isinstance(item, (list, tuple)) else [item]
        for elem in extra:
            self._check_supported(elem)
        return FeatConcat(
            [*self, *extra],
            dtype=self.dtype,
            params=self.params,
            collate_fn=self.collate_fn,
        )

    def __setitem__(self, index, value):
        """Override the `__setitem__` method to accept only `FPVecTransformer`"""
        self._check_supported(value)
        super().__setitem__(index, value)

    @property
    def length(self):
        """
        Length property for Feat concatenation.  This is the sum of the length of each transformer.
        Note that __len__ returns the number of base featurizers here instead.
        """
        # Recomputed on every access: the container can be mutated through any
        # list method, so a value cached once would silently go stale.
        full_length = 0
        for feat in self:
            if isinstance(feat, FeatConcat):
                full_length += feat.length
            else:
                full_length += len(feat)
        self._length = full_length
        return full_length

    def _check_supported(self, item):
        r"""Check if the item is the right type

        Raises:
            ValueError: if ``item`` is not an ``FPVecTransformer``
        """
        if not isinstance(item, FPVecTransformer):
            raise ValueError("FPVecTransformer allowed only, provided {}".format(type(item)))

    def get_collate_fn(self, *args, **kwargs):
        """
        Get collate function of this featurizer. This is the `collate_fn` given at
        construction, or None when the attribute is missing. Positional and keyword
        arguments are accepted and ignored.

        Returns:
            fn: Collate function for pytorch or None
        """
        return getattr(self, "collate_fn", None)

    def iter_index(self, indexes: Union[int, Iterator[int]]):
        r"""
        Allow the `FeatConcat` to be indexed using a list, or any other iterable.

        Args:
            indexes: The indexes to index the ``FeatConcat``. A non-iterable value
                is treated as a single index.

        Returns:
            indexed_fpconcat: A new FeatConcat object with the indexed element
        """
        if not isinstance(indexes, (list, tuple)):
            try:
                indexes = list(indexes)
            except TypeError:
                # not iterable: a single index was given
                indexes = [indexes]
        return FeatConcat([self[ii] for ii in indexes])

    @property
    def columns(self):
        """Get the list of columns for the concatenated molecules

        Returns:
            columns (list): Name of the columns of the descriptor
        """
        # Small probe molecule, only featurized for transformers that expose no `columns`
        tmp_mol = dm.to_mol("CC(C)O")
        columns = []
        for fp in self:
            fp_columns = getattr(fp, "columns", None)
            fp_name = str(fp)
            if fp_columns is None:
                # NOTE(review): this unpacking assumes that calling a transformer
                # returns a 2-tuple - confirm against FPVecTransformer.__call__
                fp_out, _ = fp([tmp_mol])
                fp_out = np.asarray(fp_out)
                fp_columns = [f"{fp_name}:{ind}" for ind in range(fp_out.shape[-1])]
            columns.extend(fp_columns)
        return columns

    def transform(self, mols: List[Union[dm.Mol, str]], **kwargs):
        r"""
        Calls the ``FPVecTransformer.transform`` for each transformer in
        the current list, and concatenates the resulting fingerprints.

        Args:
            mols: List of SMILES or molecules
            kwargs: named parameters forwarded to the ``transform`` of each transformer

        Returns:
            fps: Computed fingerprints of size NxD, where D is the
                sum of the length of each transformer and N is the number of input
                molecules that have been successfully featurized.
        """

        fps = []
        for fp_trans in self:
            out = fp_trans.transform(mols, enforce_dtype=False, **kwargs)
            out = datatype.cast(out, dtype="pandas")
            fps.append(out)
        fps = pd.concat(fps, axis=1)
        # The labels are dropped by `.values` below; the assignment is kept because
        # pandas rejects a column list whose length does not match the frame width.
        fps.columns = self.columns
        return fps.values

    def __call__(
        self,
        mols: List[Union[dm.Mol, str]],
        enforce_dtype: bool = False,
        ignore_errors: bool = False,
        **kwargs,
    ):
        r"""
        Calls each of the internal transformer and concatenate results only on valid indices.

        Args:
            mols:  Mol or SMILES of the molecules to be transformed
            enforce_dtype: whether to enforce the instance dtype in the generated fingerprint
            ignore_errors: Whether to ignore errors during featurization or raise an error.
            kwargs: Named parameters for the transform method

        Returns:

            fp: array
                computed fingerprints of size NxD, where D is the
                sum of the length of each transformer and N is the number of input
                molecules that have been successfully featurized.
            idx: array
                Indices of successful featurization given the original molecule input.
                Only returned when ``ignore_errors`` is True.
        """

        fps = []
        valid_idx = np.zeros(len(mols))
        for transf in self:
            fp = transf.transform(mols, ignore_errors=ignore_errors, **kwargs)
            fp, idx = transf._filter_none(fp)
            fps.append(fp)
            valid_idx[idx] += 1  # increase counter of valids
        # a molecule is valid only if every transformer managed to featurize it
        valid_idx = np.nonzero(valid_idx == len(self))[0]
        # NOTE(review): each block is filtered independently above; if transformers
        # fail on different molecules the blocks may not share the same rows -
        # confirm against the semantics of `_filter_none`.
        fps = np.concatenate(fps, axis=1)
        if self.dtype is not None and enforce_dtype:
            fps = datatype.cast(fps, dtype=self.dtype, columns=self.columns)
        if not ignore_errors:
            return fps
        return fps, list(valid_idx)

    def fit_transform(
        self,
        mols: List[Union[str, dm.Mol]],
        y: Optional[Iterable] = None,
        fit_kwargs: Dict = None,
        trans_kwargs: Dict = None,
    ):
        r"""
        Calls ``self.fit`` followed by ``self.transform``, so that each transformer in
        the current list is fitted before the fingerprints are computed and concatenated.

        Args:
            mols: List of SMILES or molecules
            y: target for the fitting. Usually ignored for FPVecTransformer
            fit_kwargs: named parameters for fit
            trans_kwargs: named parameters for transform

        Returns:

            fp: computed fingerprints of size NxD, where D is the
                sum of the length of each transformer and N is the number of input
                molecules that have been successfully featurized.
        """
        fit_kwargs = {} if fit_kwargs is None else fit_kwargs
        trans_kwargs = {} if trans_kwargs is None else trans_kwargs
        self.fit(mols, y=y, **fit_kwargs)
        return self.transform(mols, **trans_kwargs)

    def fit(self, X: List[Union[dm.Mol, str]], y=None, **kwargs):
        r"""
        Calls the ``FPVecTransformer.fit`` for each transformer in the current list.

        Args:
            X: input list of molecules
            y (list, optional): Optional list of molecular properties. Defaults to None.
            kwargs: named parameters forwarded to the ``fit`` of each transformer

        Returns:
            self: FeatConcat instance after fitting
        """

        for fp_trans in self:
            fp_trans.fit(X, y=y, **kwargs)
        return self

columns property

Get the list of columns for the concatenated molecules

Returns:

Name Type Description
columns list

Name of the columns of the descriptor

length property

Length property for Feat concatenation. This is the sum of the length of each transformer. Note that __len__ returns the number of base featurizers here instead.

__add__(item)

Override the __add__ method

Source code in molfeat/trans/concat.py
75
76
77
78
def __add__(self, item):
    """Override the `__add__` method

    Args:
        item: a single `FPVecTransformer`, or a list/tuple of them
            (another `FeatConcat` is a list and is therefore accepted).

    Returns:
        A new `FeatConcat` holding the current featurizers followed by the added
        ones, built with the same `dtype`, `params` and `collate_fn`.

    Raises:
        ValueError: if any added element is rejected by `_check_supported`
    """
    extra = list(item) if isinstance(item, (list, tuple)) else [item]
    for elem in extra:
        self._check_supported(elem)
    return FeatConcat(
        [*self, *extra],
        dtype=self.dtype,
        params=self.params,
        collate_fn=self.collate_fn,
    )

__call__(mols, enforce_dtype=False, ignore_errors=False, **kwargs)

Calls each of the internal transformer and concatenate results only on valid indices.

Parameters:

Name Type Description Default
mols List[Union[Mol, str]]

Mol or SMILES of the molecules to be transformed

required
enforce_dtype bool

whether to enforce the instance dtype in the generated fingerprint

False
ignore_errors bool

Whether to ignore errors during featurization or raise an error.

False
kwargs

Named parameters for the transform method

{}

Returns:

fp: array
    computed fingerprints of size NxD, where D is the
    sum of the length of each transformer and N is the number of input
    molecules that have been successfully featurized.
idx: array
    Indices of successful featurization given the original molecule input.
Source code in molfeat/trans/concat.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def __call__(
    self,
    mols: List[Union[dm.Mol, str]],
    enforce_dtype: bool = False,
    ignore_errors: bool = False,
    **kwargs,
):
    r"""
    Run every transformer of the container on the molecules and join the outputs.

    Args:
        mols: Mol or SMILES of the molecules to be transformed
        enforce_dtype: whether the joined output should be cast to the instance dtype
        ignore_errors: forwarded to each transformer; also selects the return shape
        kwargs: Named parameters forwarded to the transform method of each transformer

    Returns:

        fp: array
            the per-transformer outputs concatenated along axis 1
        idx: array
            indices (into ``mols``) that every transformer reported as valid.
            Only returned when ``ignore_errors`` is True.
    """

    hit_counts = np.zeros(len(mols))
    blocks = []
    for featurizer in self:
        raw = featurizer.transform(mols, ignore_errors=ignore_errors, **kwargs)
        kept, kept_idx = featurizer._filter_none(raw)
        blocks.append(kept)
        # one more transformer succeeded on these positions
        hit_counts[kept_idx] += 1

    # positions that were kept by all the transformers
    common_idx = np.nonzero(hit_counts == len(self))[0]
    joined = np.concatenate(blocks, axis=1)
    if enforce_dtype and self.dtype is not None:
        joined = datatype.cast(joined, dtype=self.dtype, columns=self.columns)

    if ignore_errors:
        return joined, list(common_idx)
    return joined

__init__(iterable=None, dtype=None, params=None, collate_fn=None)

Featurizer concatenator

Parameters:

Name Type Description Default
iterable Optional[Union[Iterable, str]]

List of featurizer to concatenate.

None
dtype Optional[Callable]

Datatype of the computed fingerprint

None
params Optional[Dict[str, Any]]

Optional dictionary of parameters for the featurizers when there is a need for initializing them

None
collate_fn Optional[Callable]

optional function to provide for custom collating. By default the collate function will be None, which will use the torch default

None
Source code in molfeat/trans/concat.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def __init__(
    self,
    iterable: Optional[Union[Iterable, str]] = None,
    dtype: Optional[Callable] = None,
    params: Optional[Dict[str, Any]] = None,
    collate_fn: Optional[Callable] = None,
):
    """Build the container from featurizers and/or featurizer names

    Args:
        iterable: Featurizers to concatenate. A string is split on `_STR_DELIMITER`
            and every stripped token is treated as a featurizer name; names are
            turned into `FPVecTransformer(kind=name, ...)`.
        dtype: Datatype of the computed fingerprint
        params: Optional dictionary, looked up by featurizer name, holding the keyword
            arguments used when a featurizer has to be initialized from its name
        collate_fn: optional function to provide for custom collating.
            By default the collate function will be None, which will use the torch default
    """
    super().__init__()
    self.params = params if params else {}

    if isinstance(iterable, str):
        entries = [token.strip() for token in iterable.split(self._STR_DELIMITER)]
    elif iterable is None:
        entries = ()
    else:
        entries = iterable

    for entry in entries:
        if isinstance(entry, str):
            # a name: instantiate the featurizer with its dedicated parameters, if any
            entry_kwargs = self.params.get(entry, {})
            entry = FPVecTransformer(kind=entry, **entry_kwargs)
        self.append(entry)

    self.dtype = dtype
    self._length = None
    self.collate_fn = collate_fn

__setitem__(index, value)

Override the __setitem__ method

Source code in molfeat/trans/concat.py
80
81
82
83
def __setitem__(self, index, value):
    """Replace the entry at `index`, once `value` has passed `_check_supported`"""
    # validate first so that a rejected value never lands in the container
    self._check_supported(value)
    super().__setitem__(index, value)

append(item)

Override the append to accept only FPVecTransformer

Source code in molfeat/trans/concat.py
65
66
67
68
def append(self, item):
    r"""Add ``item`` at the end of the container, once it has passed ``_check_supported``"""
    # validate first so that a rejected item never lands in the container
    self._check_supported(item)
    super().append(item)

fit(X, y=None, **kwargs)

Calls the FPVecTransformer.fit for each transformer in the current list.

Parameters:

Name Type Description Default
X List[Union[Mol, str]]

input list of molecules

required
y list

Optional list of molecular properties. Defaults to None.

None

Returns:

Name Type Description
self

FeatConcat instance after fitting

Source code in molfeat/trans/concat.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
def fit(self, X: List[Union[dm.Mol, str]], y=None, **kwargs):
    r"""
    Fit, in order, every transformer held by the container.

    Args:
        X: input list of molecules
        y (list, optional): Optional list of molecular properties. Defaults to None.
        kwargs: named parameters forwarded to the ``fit`` of each transformer

    Returns:
        self: the same instance, to allow chaining
    """

    for transformer in self:
        transformer.fit(X, y=y, **kwargs)
    return self

fit_transform(mols, y=None, fit_kwargs=None, trans_kwargs=None)

Calls self.fit followed by self.transform, so that each transformer in the current list is fitted before the resulting fingerprints are computed and concatenated.

Parameters:

Name Type Description Default
mols List[Union[str, Mol]]

List of SMILES or molecules

required
y Optional[Iterable]

target for the fitting. Usually ignored for FPVecTransformer

None
fit_kwargs Dict

named parameters for fit

None
trans_kwargs Dict

named parameters for transform

None

Returns:

fp: computed fingerprints of size NxD, where D is the
    sum of the length of each transformer and N is the number of input
    molecules that have been successfully featurized.
Source code in molfeat/trans/concat.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
def fit_transform(
    self,
    mols: List[Union[str, dm.Mol]],
    y: Optional[Iterable] = None,
    fit_kwargs: Dict = None,
    trans_kwargs: Dict = None,
):
    r"""
    Fit the container on the molecules with ``self.fit``, then featurize the
    same molecules with ``self.transform``.

    Args:
        mols: List of SMILES or molecules
        y: target for the fitting, forwarded to ``fit``
        fit_kwargs: named parameters for fit (None means no extra parameter)
        trans_kwargs: named parameters for transform (None means no extra parameter)

    Returns:

        fp: the output of ``self.transform`` on ``mols``
    """
    if fit_kwargs is None:
        fit_kwargs = {}
    if trans_kwargs is None:
        trans_kwargs = {}

    self.fit(mols, y=y, **fit_kwargs)
    features = self.transform(mols, **trans_kwargs)
    return features

get_collate_fn(*args, **kwargs)

Get the collate function of this featurizer. The FeatConcat featurizer uses the default collate function, which does not do anything.

Returns:

Name Type Description
fn

Collate function for pytorch or None

Source code in molfeat/trans/concat.py
106
107
108
109
110
111
112
113
114
def get_collate_fn(self, *args, **kwargs):
    """
    Return the collate function stored on this featurizer.

    Positional and keyword arguments are accepted but not used.

    Returns:
        fn: the `collate_fn` attribute, or None when the attribute does not exist
    """
    try:
        return self.collate_fn
    except AttributeError:
        return None

insert(index, item)

Override the insert to accept only FPVecTransformer

Source code in molfeat/trans/concat.py
70
71
72
73
def insert(self, index, item):
    r"""Override the ``insert`` to accept only items that pass ``_check_supported``

    Args:
        index: position before which ``item`` is inserted
        item: featurizer to insert
    """
    # validate first so that a rejected item never lands in the container
    self._check_supported(item)
    super().insert(index, item)

iter_index(indexes)

Allow the FeatConcat to be indexed using a list, or any other iterable.

Parameters:

Name Type Description Default
indexes Union[int, Iterator[int]]

The indexes to index the FeatConcat.

required

Returns indexed_fpconcat: A new FeatConcat object with the indexed element

Source code in molfeat/trans/concat.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def iter_index(self, indexes: Union[int, Iterator[int]]):
    r"""
    Select several entries of the `FeatConcat` at once.

    Args:
        indexes: Positions to pick. Lists and tuples are used as they are, any other
            iterable is materialized into a list, and a non-iterable value is
            treated as a single position.

    Returns:
        indexed_fpconcat: A new FeatConcat object with the indexed element
    """
    if isinstance(indexes, (list, tuple)):
        positions = indexes
    else:
        try:
            positions = list(indexes)
        except TypeError:
            # not iterable: a single position was given
            positions = [indexes]

    picked = [self[pos] for pos in positions]
    return FeatConcat(picked)

transform(mols, **kwargs)

Calls the FPVecTransformer.transform for each transformer in the current list, and concatenates the resulting fingerprints.

Parameters:

Name Type Description Default
mols List[Union[Mol, str]]

List of SMILES or molecules

required
kwargs

named parameters for transform (see below)

{}

Returns:

Name Type Description
fps

Computed fingerprints of size NxD, where D is the sum of the length of each transformer and N is the number of input molecules that have been successfully featurized.

Source code in molfeat/trans/concat.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def transform(self, mols: List[Union[dm.Mol, str]], **kwargs):
    r"""
    Featurize the molecules with every transformer of the container and join
    the outputs column-wise.

    Args:
        mols: List of SMILES or molecules
        kwargs: named parameters forwarded to the ``transform`` of each transformer

    Returns:
        fps: values of the frame obtained by concatenating, along axis 1,
            the output of each transformer
    """

    # each output is cast to pandas so that the blocks can be joined with pd.concat
    frames = [
        datatype.cast(
            featurizer.transform(mols, enforce_dtype=False, **kwargs),
            dtype="pandas",
        )
        for featurizer in self
    ]
    merged = pd.concat(frames, axis=1)
    merged.columns = self.columns
    return merged.values