molfeat.trans.struct

ESM

ESMProteinFingerprint

Bases: MoleculeTransformer

ESM (Evolutionary Scale Modeling) protein representation embedding. ESM is a transformer protein language model introduced by Facebook FAIR in Rives et al., 2019: 'Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences'

Source code in molfeat/trans/struct/esm.py
class ESMProteinFingerprint(MoleculeTransformer):
    """
    ESM (Evolutionary Scale Modeling) protein representation embedding.
    ESM is a transformer protein language model introduced by Facebook FAIR in Rives et al., 2019:
    'Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences'
    """

    def __init__(
        self,
        featurizer: str = "esm1b_t33_650M_UR50S",
        loader_repo_or_dir: str = "facebookresearch/esm:main",
        device: Optional[str] = None,
        layers: List[int] = None,
        pooling: str = "mean",
        dtype: Callable = None,
        contact: bool = False,
        **kwargs,
    ):
        """Constructor for ESM protein representation

        Args:
            featurizer: Name of the ESM model to use. Defaults to "esm1b_t33_650M_UR50S".
            loader_repo_or_dir: Path to a local dir containing the model or to a github repo. Defaults to "facebookresearch/esm:main".
            device: Torch device to move the model to. Defaults to None.
            layers: Layers to use to extract information. Defaults to None, which uses the last layer.
            pooling: Pooling method to use for sequence embedding. Defaults to "mean".
                If you set pooling to None, the token representation will be returned (excluding BOS).
            dtype: Representation output datatype. Defaults to None.
            contact: Whether to return the predicted attention contacts instead of the representation. Defaults to False.
        """
        self._model_name = featurizer
        self.device = device
        self.dtype = dtype
        self.featurizer = None
        self.alphabet = None
        self.batch_converter = None
        self._fitted = True
        self.cols_to_keep = None
        self.repr_layers = layers
        self.repo_or_dir = loader_repo_or_dir
        self.contact = contact
        max_layer_pattern = re.compile(".*_t([0-9]+)_.*")
        self._max_layers = int(max_layer_pattern.match(featurizer).group(1))
        if layers is None:
            self.repr_layers = [self._max_layers]
        if any(lay > self._max_layers for lay in self.repr_layers):
            raise ValueError(
                "You are requesting more layers than available for this pretrained model"
            )
        self._representation = "seq"
        self.pooling = Pooling(dim=0, name=pooling)
        if pooling is None:
            self._representation = "token"
        self._feat_length = None
        self._load_model()

    def _load_model(self):
        """Load model internally"""
        self.featurizer, self.alphabet = torch.hub.load(self.repo_or_dir, self._model_name)  # type: ignore
        self.batch_converter = self.alphabet.get_batch_converter()
        if self.device is not None:
            self.featurizer = self.featurizer.to(self.device)
        self.featurizer.eval()

    def __len__(self):
        """Get featurizer length"""
        if self._feat_length is None and not self.contact:
            embds = self._transform("MMMM")
            self._feat_length = embds.shape[-1]
        return self._feat_length

    @property
    def n_layers(self):
        """Number of layers used in the current embeddings"""
        return len(self.repr_layers)

    @torch.no_grad()
    def _embed(self, prot_seqs: List[str], prot_names: Optional[List[str]] = None, **kwargs):
        r"""
        Compute features for a batch of protein sequences.
        This method may need to be reimplemented by child classes.

        Args:
           prot_seqs: protein sequences as a sequence of amino acids
           prot_names: protein names

        Returns:
            feat: list of N_SEQ representation, each of size (SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (FEAT_DIM * N_LAYERS) for sequence embeddings. Note that SEQ_LEN will include the stop token.

        """
        if isinstance(prot_seqs, str):
            prot_seqs = [prot_seqs]
        if prot_names is None:
            prot_names = ["protein_{i}" for i in range(len(prot_seqs))]
        if isinstance(prot_names, str):
            prot_names = [prot_names]
        if len(prot_seqs) != len(prot_names):
            raise ValueError("Must provide the same number of protein sequence and label")
        data = list(zip(prot_names, prot_seqs))
        *_, batch_tokens = self.batch_converter(data)
        if self.device is not None:
            batch_tokens = batch_tokens.to(self.device)

        results = self.featurizer(
            batch_tokens, repr_layers=self.repr_layers, return_contacts=self.contact
        )
        embeddings = []
        if self.contact:
            for _, (seq, att_concats) in enumerate(zip(prot_seqs, results["contacts"])):
                embeddings.append(att_concats[: len(seq), : len(seq)])
        else:
            representation = torch.stack(
                [results["representations"][x] for x in self.repr_layers], dim=-1
            )
            if self._representation.startswith("seq"):
                for seq, token_rep in zip(prot_seqs, representation):
                    embeddings.append(
                        self.pooling(token_rep[1 : len(seq) + 1]).view(1, -1).squeeze(0)
                    )
            else:
                embeddings = list(
                    representation.view(representation.shape[0], representation.shape[1], -1)
                )
        return embeddings

    def __repr__(self):
        return "{}(model={}, pooling={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self._model_name),
            _parse_to_evaluable_str(self.pooling.name),
            _parse_to_evaluable_str(self.dtype),
        )

    def _transform(self, protein_seq: str, protein_name: str = None):
        """
        Transform a protein sequence into a feature vector.

        Args:
            protein_seq: protein sequence as an amino acid string
            protein_name: protein name

        Returns:
            Embedding of size (SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (FEAT_DIM * N_LAYERS) for sequence embeddings
        """
        return self._embed(protein_seq, protein_name)[0]

    def transform(self, seqs: List[str], names: Optional[List[str]] = None, **kwargs):
        """
        Transform a list of protein sequences into feature vectors.

        Args:
            seqs: list of protein sequence as amino acids
            names: protein names

        Returns:
            Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings
                and (N_SEQS, FEAT_DIM * N_LAYERS) for sequence embeddings.
        """
        if (
            names is None
            and isinstance(seqs, list)
            and isinstance(seqs[0], list)
            and len(seqs[0]) == 2
        ):
            names, seqs = zip(*seqs)
            seqs = list(seqs)
            names = list(names)
        return self._embed(seqs, names)

    def __call__(
        self,
        seqs: List[str],
        names: Optional[List[str]] = None,
        ignore_errors: bool = False,
        enforce_dtype: bool = True,
        **kwargs,
    ):
        r"""
        Compute the molecular representation of protein sequences.
        If ignore_errors is True, a list of features and valid ids is returned.

        Args:
            seqs: list of protein sequence as amino acids
            names: protein names
            enforce_dtype: whether to enforce the instance dtype in the generated fingerprint
            ignore_errors: Whether to ignore errors during featurization or raise an error.
            kwargs: Named parameters for the transform method

        Returns:
            feats: list of valid embeddings
            ids: all valid positions that did not fail during featurization.
                Only returned when ignore_errors is True.

        """
        features = self.transform(seqs, names, ignore_errors=ignore_errors, **kwargs)
        ids = np.arange(len(features))
        if ignore_errors:
            features, ids = self._filter_none(features)
        if self.dtype is not None and enforce_dtype:
            if self.contact or not self._representation.startswith("seq"):
                features = [
                    datatype.cast(feat, dtype=self.dtype, columns=self.columns) for feat in features
                ]
            else:
                features = datatype.cast(features, dtype=self.dtype, columns=self.columns)
        if not ignore_errors:
            return features
        return features, ids
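A minimal usage sketch, assuming molfeat is installed and the pretrained weights can be fetched through torch.hub (the import path follows the source file shown above; the sequences are illustrative):

from molfeat.trans.struct.esm import ESMProteinFingerprint

# Instantiation loads the pretrained model through torch.hub
# (a large download on first use)
embedder = ESMProteinFingerprint(featurizer="esm1b_t33_650M_UR50S", pooling="mean")

seqs = ["MKTVRQERLKSIVRILERSKEPVSGAQ", "MKQLEDKVEELLSKNYHLENEVARLKK"]
feats = embedder(seqs)  # one vector of size FEAT_DIM * N_LAYERS per sequence
print(len(feats), feats[0].shape)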

n_layers property

Number of layers used in the current embeddings

__call__(seqs, names=None, ignore_errors=False, enforce_dtype=True, **kwargs)

Compute the molecular representation of protein sequences. If ignore_errors is True, a list of features and valid ids is returned.

Parameters:

- seqs (List[str], required): list of protein sequences as amino acids
- names (Optional[List[str]], default None): protein names
- enforce_dtype (bool, default True): whether to enforce the instance dtype in the generated fingerprint
- ignore_errors (bool, default False): whether to ignore errors during featurization or raise an error
- kwargs (default {}): named parameters for the transform method

Returns:

- feats: list of valid embeddings
- ids: all valid positions that did not fail during featurization. Only returned when ignore_errors is True.
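A short sketch of the error-tolerant call path, reusing the `embedder` from the example above:

# ignore_errors=True returns the features together with the indices
# of the inputs that featurized successfully
feats, ids = embedder(seqs, ignore_errors=True)

# ignore_errors=False (the default) returns only the features
feats = embedder(seqs)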


__init__(featurizer='esm1b_t33_650M_UR50S', loader_repo_or_dir='facebookresearch/esm:main', device=None, layers=None, pooling='mean', dtype=None, contact=False, **kwargs)

Constructor for ESM protein representation

Parameters:

- featurizer (str, default "esm1b_t33_650M_UR50S"): name of the ESM model to use
- loader_repo_or_dir (str, default "facebookresearch/esm:main"): path to a local dir containing the model or to a github repo
- device (Optional[str], default None): torch device to move the model to
- layers (List[int], default None): layers to use to extract information; defaults to the last layer
- pooling (str, default "mean"): pooling method to use for the sequence embedding; if pooling is None, the token representation is returned (excluding BOS)
- dtype (Callable, default None): representation output datatype
- contact (bool, default False): whether to return the predicted attention contacts instead of the representation
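A hedged configuration sketch showing the main options (the layer indices are illustrative; they must not exceed the layer count encoded in the model name):

from molfeat.trans.struct.esm import ESMProteinFingerprint

# Keep per-token embeddings from the two last layers
# ("_t33_" in the model name means 33 transformer layers)
token_embedder = ESMProteinFingerprint(
    featurizer="esm1b_t33_650M_UR50S",
    layers=[32, 33],
    pooling=None,  # pooling=None returns token representations (excluding BOS)
)

# Predicted attention contacts instead of embeddings
contact_embedder = ESMProteinFingerprint(contact=True)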

__len__()

Get featurizer length
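A one-line sketch, reusing the `embedder` from the example above (the first call runs a forward pass on a dummy sequence to infer the size):

feat_dim = len(embedder)  # e.g. 1280 for esm1b_t33_650M_UR50S with a single layer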


transform(seqs, names=None, **kwargs)

Transform a list of protein sequences into feature vectors.

Parameters:

- seqs (List[str], required): list of protein sequences as amino acids
- names (Optional[List[str]], default None): protein names

Returns:

Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM * N_LAYERS) for token embeddings and (N_SEQS, FEAT_DIM * N_LAYERS) for sequence embeddings.
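As the implementation above shows, `transform` also accepts `[name, sequence]` pairs when `names` is omitted; a brief sketch reusing `embedder`:

# Equivalent calls: explicit names, or [name, sequence] pairs
feats = embedder.transform(["MKTVRQERLK"], names=["protA"])
feats = embedder.transform([["protA", "MKTVRQERLK"]])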


Bio Embeddings

ProtBioFingerprint

Bases: MoleculeTransformer

Wrapper for general purpose biological sequence representations, as provided by bio_embeddings

For a list of available embeddings, see: https://docs.bioembeddings.com/v0.2.2/api/bio_embeddings.embed.html

Note: The embeddings proposed here are the general purpose embeddings, meaning that task-specific embeddings offered by bio_embeddings (e.g. PBTucker, DeepBlast) are not included.

According to the bio_embeddings documentation, `prottrans_bert_bfd` and `seqvec` are the best embeddings.
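A minimal usage sketch, assuming `bio_embeddings[all]` is installed (the import path follows the source file shown below; the sequence is illustrative):

from molfeat.trans.struct.prot1D import ProtBioFingerprint

# "seqvec" is one of the supported general-purpose embedders
embedder = ProtBioFingerprint(featurizer="seqvec", pooling="mean")

feats = embedder(["MKTVRQERLKSIVRILERSKEPVSGAQ"])  # list of pooled per-sequence vectors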
Source code in molfeat/trans/struct/prot1D.py
class ProtBioFingerprint(MoleculeTransformer):
    """
    Wrapper for general purpose biological sequence representations, as provided by [`bio_embeddings`](https://github.com/sacdallago/bio_embeddings)

    For a list of available embeddings, see: https://docs.bioembeddings.com/v0.2.2/api/bio_embeddings.embed.html

    !!! note:
        The embeddings proposed here are the general purpose embeddings, meaning that task-specific
        embeddings offered by `bio_embeddings` (e.g. PBTucker, DeepBlast) are not included.

        According to the bio_embeddings documentation, `prottrans_bert_bfd` and `seqvec` are the best embeddings.
    """

    SUPPORTED_EMBEDDINGS = [
        "bepler",
        "cpcprot",
        "esm",
        "esm1b",
        "esm1v",
        "fasttext",
        "glove",
        "one_hot_encoding",
        "plus_rnn",
        "prottrans_albert_bfd",
        "prottrans_bert_bfd",
        "prottrans_t5_bfd",
        "prottrans_t5_uniref50",
        "prottrans_t5_xl_u50",
        "prottrans_xlnet_uniref100",
        "seqvec",
        "unirep",
        "word2vec",
    ]

    def __init__(
        self,
        featurizer: Union[str, Callable] = "seqvec",
        pooling: str = "mean",
        dtype: Callable = np.float32,
        device: Optional[Union[torch.device, str]] = None,
        layer_pooling: str = "sum",
        **kwargs,
    ):
        """Constructor for Deep Learning based Protein representation.
        SeqVec featurizer will e

        Args:
            featurizer: Name or callable of the embedding model
            pooling: Pooling method to use for sequence embedding. Defaults to "mean".
                If you set pooling to None, token representation will be returned
            dtype: Representation output datatype. Defaults to np.float32.
            device: Torch device to move the model to. Defaults to None.
            layer_pooling: Layer-wise pooling method to use when more than one layer exists. Defaults to 'sum'.
                If None, the last layer is taken. This is mostly relevant for `seqvec`.
        """
        if not requires.check("bio_embeddings"):
            raise ValueError(
                "Cannot use this featurizer without bio_embeddings (pip install 'bio_embeddings[all]')."
            )

        if isinstance(featurizer, bio_embedder.EmbedderInterface):
            # use the provided embedder instance directly
            self._model_name = featurizer.name
        else:
            if (
                not isinstance(featurizer, str)
                or featurizer.lower() not in self.SUPPORTED_EMBEDDINGS
            ):
                raise ValueError("Unknown featurizer: {}".format(featurizer))
            self._model_name = featurizer.lower()
            featurizer = bio_embedder.name_to_embedder[self._model_name](device=device, **kwargs)

        super().__init__(featurizer=featurizer, dtype=dtype, **kwargs)
        self._fitted = True
        self._representation = "seq"
        self.pooling = Pooling(dim=0, name=pooling)
        self.layer_pooling = Pooling(dim=0, name=layer_pooling)
        if pooling is None:
            self._representation = "token"
        self._feat_length = None

    def __len__(self):
        """Get featurizer length"""
        return self.featurizer.embedding_dimension

    @property
    def n_layers(self):
        """Get the number of layers used in this embedding"""
        return self.featurizer.number_of_layers

    def __repr__(self):
        return "{}(model={}, pooling={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self._model_name),
            _parse_to_evaluable_str(self.pooling.name),
            _parse_to_evaluable_str(self.dtype),
        )

    def _pool(self, embedding: list):
        """Perform embedding pooling
        Args:
            embedding: input embedding
        """
        if self.n_layers > 1 and self.layer_pooling.name is not None:
            embedding = self.layer_pooling(embedding)
        if len(embedding.shape) > 2:
            # we forcefully take the last layers
            embedding = embedding[-1]
        return self.pooling(embedding)

    def _transform(
        self,
        protein_seq: str,
        **kwargs,
    ):
        """
        Transform a protein/nucleotide sequence into a feature vector.

        Args:
            protein_seq: protein sequence as an amino acid string

        Returns:
            Embedding of size (SEQ_LEN, FEAT_DIM) for token embeddings
                and (FEAT_DIM,) for sequence embeddings
        """

        rep = self.featurizer.embed(protein_seq)
        return self._pool(rep)

    def transform(self, seqs: List[str], names: Optional[List[str]] = None, **kwargs):
        """
        Transform a list of protein/nucleotide sequences into feature vectors.

        Args:
            seqs: list of protein/nucleotide sequences as amino acids
            names: names of the macromolecules. Will be ignored
            kwargs: additional arguments for the featurizer

        Returns:
            Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM) for token embeddings
                and (N_SEQS, FEAT_DIM) for sequence embeddings
        """
        if not isinstance(seqs, list):
            seqs = [seqs]
        if isinstance(seqs[0], (list, tuple)) and len(seqs[0]) == 2:
            _, seqs = zip(*seqs)
            seqs = list(seqs)
        res = list(self.featurizer.embed_many(seqs, **kwargs))
        res = [self._pool(x) for x in res]
        return res

    def __call__(
        self,
        seqs: List[str],
        ignore_errors: bool = False,
        enforce_dtype: bool = True,
        **kwargs,
    ):
        r"""
        Compute the molecular representation of protein sequences.
        If ignore_errors is True, a list of features and valid ids is returned.

        Args:
            seqs: list of protein or nucleotide sequence as amino acids
            enforce_dtype: whether to enforce the instance dtype in the generated fingerprint
            ignore_errors: Whether to ignore errors during featurization or raise an error.
            kwargs: Named parameters for the transform method

        Returns:
            feats: list of valid embeddings
            ids: all valid positions that did not fail during featurization.
                Only returned when ignore_errors is True.

        """
        features = self.transform(seqs, **kwargs)
        ids = np.arange(len(features))
        if ignore_errors:
            features, ids = self._filter_none(features)
        if self.dtype is not None and enforce_dtype:
            if self._representation.startswith("token"):
                features = [
                    datatype.cast(feat, dtype=self.dtype, columns=self.columns) for feat in features
                ]
            else:
                features = datatype.cast(features, dtype=self.dtype, columns=self.columns)
        if not ignore_errors:
            return features
        return features, ids

n_layers property

Get the number of layers used in this embedding

__call__(seqs, ignore_errors=False, enforce_dtype=True, **kwargs)

Compute the molecular representation of protein sequences. If ignore_errors is True, a list of features and valid ids is returned.

Parameters:

- seqs (List[str], required): list of protein or nucleotide sequences as amino acids
- enforce_dtype (bool, default True): whether to enforce the instance dtype in the generated fingerprint
- ignore_errors (bool, default False): whether to ignore errors during featurization or raise an error
- kwargs (default {}): named parameters for the transform method

Returns:

- feats: list of valid embeddings
- ids: all valid positions that did not fail during featurization. Only returned when ignore_errors is True.
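A brief sketch of the call options, reusing the `embedder` from the example above:

# dtype enforcement casts the pooled output (np.float32 by default here)
feats = embedder(["MKTVRQERLK"], enforce_dtype=True)

# The error-tolerant path also returns the indices that featurized successfully
feats, ids = embedder(["MKTVRQERLK"], ignore_errors=True)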


__init__(featurizer='seqvec', pooling='mean', dtype=np.float32, device=None, layer_pooling='sum', **kwargs)

Constructor for deep-learning-based protein representations.

Parameters:

- featurizer (Union[str, Callable], default 'seqvec'): name or callable of the embedding model
- pooling (str, default 'mean'): pooling method to use for the sequence embedding; if pooling is None, the token representation is returned
- dtype (Callable, default np.float32): representation output datatype
- device (Optional[Union[torch.device, str]], default None): torch device to move the model to
- layer_pooling (str, default 'sum'): layer-wise pooling method to use when more than one layer exists; if None, the last layer is taken (mostly relevant for seqvec)
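A hedged configuration sketch (the featurizer names come from SUPPORTED_EMBEDDINGS above; the corresponding bio_embeddings extras must be installed):

import numpy as np
from molfeat.trans.struct.prot1D import ProtBioFingerprint

# seqvec exposes several layers: sum them, then mean-pool over the sequence
embedder = ProtBioFingerprint(
    featurizer="seqvec",
    pooling="mean",
    layer_pooling="sum",
    dtype=np.float32,
)

# Token-level output instead: disable sequence pooling
token_embedder = ProtBioFingerprint(featurizer="prottrans_bert_bfd", pooling=None)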

__len__()

Get featurizer length
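Here the length is the embedder's reported embedding dimension, with no forward pass needed; reusing `embedder` from above:

feat_dim = len(embedder)  # e.g. 1024 for seqvec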


transform(seqs, names=None, **kwargs)

Transform a list of protein/nucleotide sequences into feature vectors.

Parameters:

- seqs (List[str], required): list of protein/nucleotide sequences as amino acids
- names (Optional[List[str]], default None): names of the macromolecules; will be ignored
- kwargs (default {}): additional arguments for the featurizer

Returns:

Embedding of size (N_SEQS, SEQ_LEN, FEAT_DIM) for token embeddings and (N_SEQS, FEAT_DIM) for sequence embeddings.
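A short sketch reusing `embedder`; as with the ESM transformer, (name, sequence) pairs are also accepted, with the names discarded:

feats = embedder.transform(["MKTVRQERLK", "MKQLEDKVEELLSK"])
feats = embedder.transform([("protA", "MKTVRQERLK")])  # the name is ignored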
