molfeat.trans.struct

ESM

ESMProteinFingerprint

Bases: MoleculeTransformer

ESM (Evolutionary Scale Modeling) protein representation embedding. ESM is a transformer protein language model introduced by Facebook FAIR in Rives et al., 2019: 'Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences'

Source code in molfeat/trans/struct/esm.py
class ESMProteinFingerprint(MoleculeTransformer):
    """
    ESM (Evolutionary Scale Modeling) protein representation embedding.
    ESM is a transformer protein language model introduced by Facebook FAIR in Rives et al., 2019:
    'Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences'
    """

    def __init__(
        self,
        featurizer: str = "esm1b_t33_650M_UR50S",
        loader_repo_or_dir: str = "facebookresearch/esm:main",
        device: Optional[str] = None,
        layers: List[int] = None,
        pooling: str = "mean",
        dtype: Callable = None,
        contact: bool = False,
        **kwargs,
    ):
        """Constructor for ESM protein representation

        Args:
            featurizer: Name of the ESM model to use. Defaults to "esm1b_t33_650M_UR50S".
            loader_repo_or_dir: Path to a local dir containing the model or to a github repo. Defaults to "facebookresearch/esm:main".
            device: Torch device to move the model to. Defaults to None.
            layers: Layers to use to extract information. Defaults to None, which uses the last layer.
            pooling: Pooling method to use for sequence embedding. Defaults to "mean".
                If you set pooling to None, the token representation will be returned (excluding BOS).
            dtype: Representation output datatype. Defaults to None.
            contact: Whether to return the predicted attention contacts instead of the representation. Defaults to False.
        """
        self._model_name = featurizer
        self.device = device
        self.dtype = dtype
        self.featurizer = None
        self.alphabet = None
        self.batch_converter = None
        self._fitted = True
        self.cols_to_keep = None
        self.repr_layers = layers
        self.repo_or_dir = loader_repo_or_dir
        self.contact = contact
        max_layer_pattern = re.compile(".*_t([0-9]+)_.*")
        self._max_layers = int(max_layer_pattern.match(featurizer).group(1))
        if layers is None:
            self.repr_layers = [self._max_layers]
        if any(lay > self._max_layers for lay in self.repr_layers):
            raise ValueError(
                "You are requesting more layers than available for this pretrained model"
            )
        self._representation = "seq"
        self.pooling = Pooling(dim=0, name=pooling)
        if pooling is None:
            self._representation = "token"
        self._feat_length = None
        self._load_model()

    def _load_model(self):
        """Load model internally"""
        self.featurizer, self.alphabet = torch.hub.load(self.repo_or_dir, self._model_name)  # type: ignore
        self.batch_converter = self.alphabet.get_batch_converter()
        if self.device is not None:
            self.featurizer = self.featurizer.to(self.device)
        self.featurizer.eval()

    def __len__(self):
        """Get featurizer length"""
        if self._feat_length is None and not self.contact:
            embds = self._transform("MMMM")
            self._feat_length = embds.shape[-1]
        return self._feat_length

    @property
    def n_layers(self):
        """Number of layers used in the current embeddings"""
        return len(self.repr_layers)

    @torch.no_grad()
    def _embed(self, prot_seqs: List[str], prot_names: Optional[List[str]] = None, **kwargs):
        r"""
        Compute features for a batch of protein sequences.
        This method may need to be reimplemented by child classes.

        Args:
           prot_seqs: protein sequences as a sequence of amino acids
           prot_names: protein names

        Returns:
            feat: list of N_SEQ representation, each of size (SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (FEAT_DIM * N_LAYERS) for sequence embeddings. Note that SEQ_LEN will include the stop token.

        """
        if isinstance(prot_seqs, str):
            prot_seqs = [prot_seqs]
        if prot_names is None:
            prot_names = ["protein_{i}" for i in range(len(prot_seqs))]
        if isinstance(prot_names, str):
            prot_names = [prot_names]
        if len(prot_seqs) != len(prot_names):
            raise ValueError("Must provide the same number of protein sequence and label")
        data = list(zip(prot_names, prot_seqs))
        *_, batch_tokens = self.batch_converter(data)
        if self.device is not None:
            batch_tokens = batch_tokens.to(self.device)

        results = self.featurizer(
            batch_tokens, repr_layers=self.repr_layers, return_contacts=self.contact
        )
        embeddings = []
        if self.contact:
            for _, (seq, att_concats) in enumerate(zip(prot_seqs, results["contacts"])):
                embeddings.append(att_concats[: len(seq), : len(seq)])
        else:
            representation = torch.stack(
                [results["representations"][x] for x in self.repr_layers], dim=-1
            )
            if self._representation.startswith("seq"):
                for seq, token_rep in zip(prot_seqs, representation):
                    embeddings.append(
                        self.pooling(token_rep[1 : len(seq) + 1]).view(1, -1).squeeze(0)
                    )
            else:
                embeddings = list(
                    representation.view(representation.shape[0], representation.shape[1], -1)
                )
        return embeddings

    def __repr__(self):
        return "{}(model={}, pooling={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self._model_name),
            _parse_to_evaluable_str(self.pooling.name),
            _parse_to_evaluable_str(self.dtype),
        )

    def _transform(self, protein_seq: str, protein_name: str = None):
        """
        Transform a protein sequence into a feature vector.

        Args:
            protein_seq: protein sequence as an amino acid string
            protein_name: protein name

        Returns:
            Embedding of size (SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (FEAT_DIM * N_LAYERS) for sequence embeddings
        """
        return self._embed(protein_seq, protein_name)[0]

    def transform(self, seqs: List[str], names: Optional[List[str]] = None, **kwargs):
        """
        Transform a list of protein sequences into feature vectors.

        Args:
            seqs: list of protein sequence as amino acids
            names: protein names

        Returns:
            Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (N_SEQS, FEAT_DIM * N_LAYERS) for sequence embeddings.
        """
        if (
            names is None
            and isinstance(seqs, list)
            and isinstance(seqs[0], list)
            and len(seqs[0]) == 2
        ):
            names, seqs = zip(*seqs)
            seqs = list(seqs)
            names = list(names)
        return self._embed(seqs, names)

    def __call__(
        self,
        seqs: List[str],
        names: Optional[List[str]] = None,
        ignore_errors: bool = False,
        enforce_dtype: bool = True,
        **kwargs,
    ):
        r"""
        Compute the molecular representation of protein sequences.
        If ignore_errors is True, a list of features and valid ids is returned.

        Args:
            seqs: list of protein sequence as amino acids
            names: protein names
            enforce_dtype: whether to enforce the instance dtype in the generated fingerprint
            ignore_errors: Whether to ignore errors during featurization or raise an error.
            kwargs: Named parameters for the transform method

        Returns:
            feats: list of valid embeddings
            ids: all valid positions that did not fail during featurization.
                Only returned when ignore_errors is True.

        """
        features = self.transform(seqs, names, ignore_errors=ignore_errors, **kwargs)
        ids = np.arange(len(features))
        if ignore_errors:
            features, ids = self._filter_none(features)
        if self.dtype is not None and enforce_dtype:
            if self.contact or not self._representation.startswith("seq"):
                features = [
                    datatype.cast(feat, dtype=self.dtype, columns=self.columns) for feat in features
                ]
            else:
                features = datatype.cast(features, dtype=self.dtype, columns=self.columns)
        if not ignore_errors:
            return features
        return features, ids
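A minimal usage sketch, assuming molfeat is installed and the pretrained weights can be fetched through torch.hub (the import path follows the source file shown above; the sequences are illustrative):

from molfeat.trans.struct.esm import ESMProteinFingerprint

# Instantiation loads the pretrained model through torch.hub
# (a large download on first use)
embedder = ESMProteinFingerprint(featurizer="esm1b_t33_650M_UR50S", pooling="mean")

seqs = ["MKTVRQERLKSIVRILERSKEPVSGAQ", "MKQLEDKVEELLSKNYHLENEVARLKK"]
feats = embedder(seqs)  # one vector of size FEAT_DIM * N_LAYERS per sequence
print(len(feats), feats[0].shape)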

n_layers property

Number of layers used in the current embeddings

__call__(seqs, names=None, ignore_errors=False, enforce_dtype=True, **kwargs)

Compute the molecular representation of protein sequences. If ignore_errors is True, a list of features and valid ids is returned.

Parameters:

- seqs (List[str], required): list of protein sequences as amino acids
- names (Optional[List[str]], default None): protein names
- enforce_dtype (bool, default True): whether to enforce the instance dtype in the generated fingerprint
- ignore_errors (bool, default False): whether to ignore errors during featurization or raise an error
- kwargs (default {}): named parameters for the transform method

Returns:

- feats: list of valid embeddings
- ids: all valid positions that did not fail during featurization. Only returned when ignore_errors is True.
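A short sketch of the error-tolerant call path, reusing the `embedder` from the example above:

# ignore_errors=True returns the features together with the indices
# of the inputs that featurized successfully
feats, ids = embedder(seqs, ignore_errors=True)

# ignore_errors=False (the default) returns only the features
feats = embedder(seqs)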


__init__(featurizer='esm1b_t33_650M_UR50S', loader_repo_or_dir='facebookresearch/esm:main', device=None, layers=None, pooling='mean', dtype=None, contact=False, **kwargs)

Constructor for ESM protein representation

Parameters:

- featurizer (str, default "esm1b_t33_650M_UR50S"): name of the ESM model to use
- loader_repo_or_dir (str, default "facebookresearch/esm:main"): path to a local dir containing the model or to a github repo
- device (Optional[str], default None): torch device to move the model to
- layers (List[int], default None): layers to use to extract information; defaults to the last layer
- pooling (str, default "mean"): pooling method to use for the sequence embedding; if pooling is None, the token representation is returned (excluding BOS)
- dtype (Callable, default None): representation output datatype
- contact (bool, default False): whether to return the predicted attention contacts instead of the representation
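A hedged configuration sketch showing the main options (the layer indices are illustrative; they must not exceed the layer count encoded in the model name):

from molfeat.trans.struct.esm import ESMProteinFingerprint

# Keep per-token embeddings from the two last layers
# ("_t33_" in the model name means 33 transformer layers)
token_embedder = ESMProteinFingerprint(
    featurizer="esm1b_t33_650M_UR50S",
    layers=[32, 33],
    pooling=None,  # pooling=None returns token representations (excluding BOS)
)

# Predicted attention contacts instead of embeddings
contact_embedder = ESMProteinFingerprint(contact=True)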

__len__()

Get featurizer length
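A one-line sketch, reusing the `embedder` from the example above (the first call runs a forward pass on a dummy sequence to infer the size):

feat_dim = len(embedder)  # e.g. 1280 for esm1b_t33_650M_UR50S with a single layer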


transform(seqs, names=None, **kwargs)

Transform a list of protein sequences into feature vectors.

Parameters:

- seqs (List[str], required): list of protein sequences as amino acids
- names (Optional[List[str]], default None): protein names

Returns:

Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings and (N_SEQS, FEAT_DIM * N_LAYERS) for sequence embeddings.
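As the implementation above shows, `transform` also accepts `[name, sequence]` pairs when `names` is omitted; a brief sketch reusing `embedder`:

# Equivalent calls: explicit names, or [name, sequence] pairs
feats = embedder.transform(["MKTVRQERLK"], names=["protA"])
feats = embedder.transform([["protA", "MKTVRQERLK"]])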


Bio Embeddings

ProtBioFingerprint

Bases: MoleculeTransformer

Wrapper for general purpose biological sequence representations, as provided by bio_embeddings

For a list of available embeddings, see: https://docs.bioembeddings.com/v0.2.2/api/bio_embeddings.embed.html

Note: The embeddings proposed here are the general purpose embeddings, meaning that task-specific embeddings offered by bio_embeddings (e.g. PBTucker, DeepBlast) are not included.

According to the bio_embeddings documentation, `prottrans_bert_bfd` and `seqvec` are the best embeddings.
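A minimal usage sketch, assuming `bio_embeddings[all]` is installed (the import path follows the source file shown below; the sequence is illustrative):

from molfeat.trans.struct.prot1D import ProtBioFingerprint

# "seqvec" is one of the supported general-purpose embedders
embedder = ProtBioFingerprint(featurizer="seqvec", pooling="mean")

feats = embedder(["MKTVRQERLKSIVRILERSKEPVSGAQ"])  # list of pooled per-sequence vectors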
Source code in molfeat/trans/struct/prot1D.py
class ProtBioFingerprint(MoleculeTransformer):
    """
    Wrapper for general purpose biological sequence representations, as provided by [`bio_embeddings`](https://github.com/sacdallago/bio_embeddings)

    For a list of available embeddings, see: https://docs.bioembeddings.com/v0.2.2/api/bio_embeddings.embed.html

    !!! note:
        The embeddings proposed here are the general purpose embeddings, meaning that task-specific
        embeddings offered by `bio_embeddings` (e.g. PBTucker, DeepBlast) are not included.

        According to the bio_embeddings documentation, `prottrans_bert_bfd` and `seqvec` are the best embeddings.
    """

    SUPPORTED_EMBEDDINGS = [
        "bepler",
        "cpcprot",
        "esm",
        "esm1b",
        "esm1v",
        "fasttext",
        "glove",
        "one_hot_encoding",
        "plus_rnn",
        "prottrans_albert_bfd",
        "prottrans_bert_bfd",
        "prottrans_t5_bfd",
        "prottrans_t5_uniref50",
        "prottrans_t5_xl_u50",
        "prottrans_xlnet_uniref100",
        "seqvec",
        "unirep",
        "word2vec",
    ]

    def __init__(
        self,
        featurizer: Union[str, Callable] = "seqvec",
        pooling: str = "mean",
        dtype: Callable = np.float32,
        device: Optional[Union[torch.device, str]] = None,
        layer_pooling: str = "sum",
        **kwargs,
    ):
        """Constructor for Deep Learning based Protein representation.
        SeqVec featurizer will e

        Args:
            featurizer: Name or callable of the embedding model
            pooling: Pooling method to use for sequence embedding. Defaults to "mean".
                If you set pooling to None, token representation will be returned
            dtype: Representation output datatype. Defaults to np.float32.
            device: Torch device to move the model to. Defaults to None.
            layer_pooling: Layer-wise pooling method to use when more than one layer exists. Defaults to 'sum'.
                If None, the last layer is taken. This is mostly relevant for `seqvec`.
        """
        if not requires.check("bio_embeddings"):
            raise ValueError(
                "Cannot use this featurizer without bio_embeddings (pip install 'bio_embeddings[all]')."
            )

        if isinstance(featurizer, bio_embedder.EmbedderInterface):
            # use the provided embedder instance directly
            self._model_name = featurizer.name
        else:
            if (
                not isinstance(featurizer, str)
                or featurizer.lower() not in self.SUPPORTED_EMBEDDINGS
            ):
                raise ValueError("Unknown featurizer: {}".format(featurizer))
            self._model_name = featurizer.lower()
            featurizer = bio_embedder.name_to_embedder[self._model_name](device=device, **kwargs)

        super().__init__(featurizer=featurizer, dtype=dtype, **kwargs)
        self._fitted = True
        self._representation = "seq"
        self.pooling = Pooling(dim=0, name=pooling)
        self.layer_pooling = Pooling(dim=0, name=layer_pooling)
        if pooling is None:
            self._representation = "token"
        self._feat_length = None

    def __len__(self):
        """Get featurizer length"""
        return self.featurizer.embedding_dimension

    @property
    def n_layers(self):
        """Get the number of layers used in this embedding"""
        return self.featurizer.number_of_layers

    def __repr__(self):
        return "{}(model={}, pooling={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self._model_name),
            _parse_to_evaluable_str(self.pooling.name),
            _parse_to_evaluable_str(self.dtype),
        )

    def _pool(self, embedding: list):
        """Perform embedding pooling
        Args:
            embedding: input embedding
        """
        if self.n_layers > 1 and self.layer_pooling.name is not None:
            embedding = self.layer_pooling(embedding)
        if len(embedding.shape) > 2:
            # we forcefully take the last layers
            embedding = embedding[-1]
        return self.pooling(embedding)

    def _transform(
        self,
        protein_seq: str,
        **kwargs,
    ):
        """
        Transform a protein/nucleotide sequence into a feature vector.

        Args:
            protein_seq: protein sequence as an amino acid string

        Returns:
            Embedding of size (SEQ_LEN, FEAT_DIM) for token embeddings
                and (FEAT_DIM,) for sequence embeddings
        """

        rep = self.featurizer.embed(protein_seq)
        return self._pool(rep)

    def transform(self, seqs: List[str], names: Optional[List[str]] = None, **kwargs):
        """
        Transform a list of protein/nucleotide sequences into feature vectors.

        Args:
            seqs: list of protein/nucleotide sequences as amino acids
            names: names of the macromolecules. Will be ignored
            kwargs: additional arguments for the featurizer

        Returns:
            Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM) for token embeddings
                and (N_SEQS, FEAT_DIM) for sequence embeddings
        """
        if not isinstance(seqs, list):
            seqs = [seqs]
        if isinstance(seqs[0], (list, tuple)) and len(seqs[0]) == 2:
            _, seqs = zip(*seqs)
            seqs = list(seqs)
        res = list(self.featurizer.embed_many(seqs, **kwargs))
        res = [self._pool(x) for x in res]
        return res

    def __call__(
        self,
        seqs: List[str],
        ignore_errors: bool = False,
        enforce_dtype: bool = True,
        **kwargs,
    ):
        r"""
        Compute the molecular representation of protein sequences.
        If ignore_errors is True, a list of features and valid ids is returned.

        Args:
            seqs: list of protein or nucleotide sequence as amino acids
            enforce_dtype: whether to enforce the instance dtype in the generated fingerprint
            ignore_errors: Whether to ignore errors during featurization or raise an error.
            kwargs: Named parameters for the transform method

        Returns:
            feats: list of valid embeddings
            ids: all valid positions that did not fail during featurization.
                Only returned when ignore_errors is True.

        """
        features = self.transform(seqs, **kwargs)
        ids = np.arange(len(features))
        if ignore_errors:
            features, ids = self._filter_none(features)
        if self.dtype is not None and enforce_dtype:
            if self._representation.startswith("token"):
                features = [
                    datatype.cast(feat, dtype=self.dtype, columns=self.columns) for feat in features
                ]
            else:
                features = datatype.cast(features, dtype=self.dtype, columns=self.columns)
        if not ignore_errors:
            return features
        return features, ids

n_layers property

Get the number of layers used in this embedding

__call__(seqs, ignore_errors=False, enforce_dtype=True, **kwargs)

Compute the molecular representation of protein sequences. If ignore_errors is True, a list of features and valid ids is returned.

Parameters:

- seqs (List[str], required): list of protein or nucleotide sequences as amino acids
- enforce_dtype (bool, default True): whether to enforce the instance dtype in the generated fingerprint
- ignore_errors (bool, default False): whether to ignore errors during featurization or raise an error
- kwargs (default {}): named parameters for the transform method

Returns:

- feats: list of valid embeddings
- ids: all valid positions that did not fail during featurization. Only returned when ignore_errors is True.
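A brief sketch of the call options, reusing the `embedder` from the example above:

# dtype enforcement casts the pooled output (np.float32 by default here)
feats = embedder(["MKTVRQERLK"], enforce_dtype=True)

# The error-tolerant path also returns the indices that featurized successfully
feats, ids = embedder(["MKTVRQERLK"], ignore_errors=True)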


__init__(featurizer='seqvec', pooling='mean', dtype=np.float32, device=None, layer_pooling='sum', **kwargs)

Constructor for deep-learning-based protein representations.

Parameters:

- featurizer (Union[str, Callable], default 'seqvec'): name or callable of the embedding model
- pooling (str, default 'mean'): pooling method to use for the sequence embedding; if pooling is None, the token representation is returned
- dtype (Callable, default np.float32): representation output datatype
- device (Optional[Union[torch.device, str]], default None): torch device to move the model to
- layer_pooling (str, default 'sum'): layer-wise pooling method to use when more than one layer exists; if None, the last layer is taken (mostly relevant for seqvec)
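A hedged configuration sketch (the featurizer names come from SUPPORTED_EMBEDDINGS above; the corresponding bio_embeddings extras must be installed):

import numpy as np
from molfeat.trans.struct.prot1D import ProtBioFingerprint

# seqvec exposes several layers: sum them, then mean-pool over the sequence
embedder = ProtBioFingerprint(
    featurizer="seqvec",
    pooling="mean",
    layer_pooling="sum",
    dtype=np.float32,
)

# Token-level output instead: disable sequence pooling
token_embedder = ProtBioFingerprint(featurizer="prottrans_bert_bfd", pooling=None)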

__len__()

Get featurizer length
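Here the length is the embedder's reported embedding dimension, with no forward pass needed; reusing `embedder` from above:

feat_dim = len(embedder)  # e.g. 1024 for seqvec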


transform(seqs, names=None, **kwargs)

Transform a list of protein/nucleotide sequences into feature vectors.

Parameters:

- seqs (List[str], required): list of protein/nucleotide sequences as amino acids
- names (Optional[List[str]], default None): names of the macromolecules; will be ignored
- kwargs (default {}): additional arguments for the featurizer

Returns:

Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM) for token embeddings and (N_SEQS, FEAT_DIM) for sequence embeddings.
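A short sketch reusing `embedder`; as with the ESM transformer, (name, sequence) pairs are also accepted, with the names discarded:

feats = embedder.transform(["MKTVRQERLK", "MKQLEDKVEELLSK"])
feats = embedder.transform([("protA", "MKTVRQERLK")])  # the name is ignored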
