Create your own featurizers
%load_ext autoreload
%autoreload 2
Define your own calculator¶
Remember that a calculator is simply a callable that takes a molecule as input (either an RDKit Chem.Mol object or a SMILES string) and returns a list of features. We can thus easily define our own calculator!
import numpy as np
import datamol as dm
from molfeat.trans import MoleculeTransformer
from rdkit.Chem.rdMolDescriptors import CalcNumHeteroatoms
smiles = dm.freesolv()["smiles"].iloc[:3]
def my_calculator(mol):
    """My custom featurizer"""
    mol = dm.to_mol(mol)
    rng = np.random.default_rng(0)
    return [mol.GetNumAtoms(), mol.GetNumBonds(), CalcNumHeteroatoms(mol), rng.random()]
# This directly works with the MoleculeTransformer
mol_transf = MoleculeTransformer(my_calculator)
mol_transf(smiles)
[array([13. , 13. , 3. , 0.63696169]), array([5. , 4. , 4. , 0.63696169]), array([5. , 4. , 0. , 0.63696169])]
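The transformer returns one feature array per molecule. If you want the output cast to a specific type, MoleculeTransformer also accepts a dtype argument (the same argument is used with the "maccs" featurizer further below). A quick sketch:
# Cast the computed features to float32
mol_transf = MoleculeTransformer(my_calculator, dtype=np.float32)
mol_transf(smiles)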
If such a function gets more complex, it might be easier to wrap it in a class. This also ensures the calculator remains serializable.
from molfeat.calc import SerializableCalculator
class MyCalculator(SerializableCalculator):
    def __call__(self, mol):
        mol = dm.to_mol(mol)
        rng = np.random.default_rng(0)
        return [mol.GetNumAtoms(), mol.GetNumBonds(), CalcNumHeteroatoms(mol), rng.random()]
mol_transf = MoleculeTransformer(MyCalculator())
mol_transf(smiles)
[array([13. , 13. , 3. , 0.63696169]), array([5. , 4. , 4. , 0.63696169]), array([5. , 4. , 0. , 0.63696169])]
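Because MyCalculator derives from SerializableCalculator, instances can be round-tripped through standard Python serialization. A minimal sketch using pickle:
import pickle

# Serialize the calculator, restore it, and use it as before
restored = pickle.loads(pickle.dumps(MyCalculator()))
MoleculeTransformer(restored)(smiles)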
If your calculator can featurize a batch of molecules efficiently, you should implement the optional batch_compute method, which MoleculeTransformer will then use instead of the default sequential or parallel processing.
from molfeat.calc import SerializableCalculator
class MyBatchableCalculator(SerializableCalculator):
    def __init__(self, random_seed=42, length=10):
        self.random_seed = random_seed
        self.length = length
        self.rng = np.random.default_rng(self.random_seed)

    def __call__(self, mol):
        print("We are in single compute mode!")
        return self.rng.random(self.length)

    def __len__(self):
        return self.length

    def batch_compute(self, mols, **kwargs):
        # note that the kwargs intended for dm.parallelized are passed along with the list of molecules
        print("We are in batch mode!")
        return self.rng.random((len(mols), self.length))
mol_transf = MoleculeTransformer(MyBatchableCalculator())
mol_transf(smiles)
We are in batch mode!
array([[0.77395605, 0.43887844, 0.85859792, 0.69736803, 0.09417735,
        0.97562235, 0.7611397 , 0.78606431, 0.12811363, 0.45038594],
       [0.37079802, 0.92676499, 0.64386512, 0.82276161, 0.4434142 ,
        0.22723872, 0.55458479, 0.06381726, 0.82763117, 0.6316644 ],
       [0.75808774, 0.35452597, 0.97069802, 0.89312112, 0.7783835 ,
        0.19463871, 0.466721  , 0.04380377, 0.15428949, 0.68304895]])
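For comparison, calling the calculator directly on a single molecule goes through __call__ and therefore takes the single compute path:
calc = MyBatchableCalculator()
# Prints "We are in single compute mode!" and returns one feature vector
calc(smiles.iloc[0])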
Define your own transformer¶
The above example shows that in many cases there is no need to create your own transformer class; you can simply use the MoleculeTransformer base class. In more complex cases, such as with pretrained models where batching the inputs is advantageous, it is preferable to create your own subclass, for example of PretrainedMolTransformer as shown below.
from sklearn.ensemble import RandomForestRegressor
from molfeat.trans.pretrained import PretrainedMolTransformer
class MyFoundationModel(PretrainedMolTransformer):
    """
    In this dummy example, we train a RF model to predict the cLogP,
    then use the feature importances of the RF model as the embedding.
    """

    def __init__(self):
        super().__init__(dtype=np.float32)
        self._featurizer = MoleculeTransformer("maccs", dtype=np.float32)
        self._model = RandomForestRegressor()
        self.train_dummy_model()

    def train_dummy_model(self):
        """
        Load the pretrained model.
        In this dummy example, we train a RF model to predict the cLogP.
        """
        data = dm.data.freesolv().smiles.values
        X = self._featurizer(data)
        y = np.array([dm.descriptors.clogp(dm.to_mol(smi)) for smi in data])
        self._model.fit(X, y)

    def _convert(self, inputs: list, **kwargs):
        """Convert the molecules to the format the model expects"""
        return self._featurizer(inputs)

    def _embed(self, mols: list, **kwargs):
        """
        Embed the molecules using the pretrained model.
        In this dummy example, we simply weight each feature by its importance in the RF model.
        """
        return [feats * self._model.feature_importances_ for feats in mols]
mol_transf = MyFoundationModel()
mol_transf(["CN1C=NC2=C1C(=O)N(C(=O)N2C)C"]).shape
(1, 167)
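Since molfeat transformers follow the scikit-learn fit/transform API, the new featurizer should slot directly into a scikit-learn Pipeline. A minimal sketch, using hypothetical random targets purely for illustration:
from sklearn.pipeline import Pipeline

train_smiles = dm.freesolv()["smiles"].iloc[:50].tolist()
# Hypothetical regression targets, for illustration only
y_train = np.random.default_rng(0).random(len(train_smiles))

pipe = Pipeline([
    ("featurizer", MyFoundationModel()),
    ("regressor", RandomForestRegressor()),
])
pipe.fit(train_smiles, y_train)
pipe.predict(train_smiles[:5])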
Here is another example that shows how to extend Molfeat with an existing embedding language model for astrochemistry.
pip install astrochem_embedding
import torch
import datamol as dm
from astrochem_embedding import VICGAE
from molfeat.trans.pretrained import PretrainedMolTransformer
class MyAstroChemFeaturizer(PretrainedMolTransformer):
    """
    In this more practical example, we use embeddings from VICGAE, a variance-invariance-covariance
    regularized GRU autoencoder trained on SELFIES strings.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.featurizer = VICGAE.from_pretrained()

    def _embed(self, smiles, **kwargs):
        return [self.featurizer.embed_smiles(x) for x in smiles]
transformer = MyAstroChemFeaturizer(dtype=torch.float)
transformer(dm.freesolv()["smiles"][:10]).shape
torch.Size([10, 32])
Add it to your Model Store¶
Molfeat has a Model Store to publish your models in a centralized location. The default is a read-only GCP bucket, but you can replace it with your own file storage. This can, for example, be useful to share private featurizers with your team.
import platformdirs
from molfeat.store.modelstore import ModelStore
from molfeat.store import ModelInfo
path = dm.fs.join(platformdirs.user_cache_dir("molfeat"), "custom_model_store")
store = ModelStore(model_store_bucket=path)
len(store.available_models)
0
# Let's define our model's info
info = ModelInfo(
    name="my_foundation_model",
    inputs="smiles",
    type="pretrained",
    group="my_group",
    version=0,
    submitter="Datamol",
    description="Solves chemistry!",
    representation="vector",
    require_3D=False,
    tags=["foundation_model", "random_forest"],
    authors=["Datamol"],
    reference="/fake/ref",
)
store.register(info)
store.available_models
2023-04-07 18:59:03.363 | INFO | molfeat.store.modelstore:register:124 - Successfuly registered model my_foundation_model !
[ModelInfo(name='my_foundation_model', inputs='smiles', type='pretrained', version=0, group='my_group', submitter='Datamol', description='Solves chemistry!', representation='vector', require_3D=False, tags=['foundation_model', 'random_forest'], authors=['Datamol'], reference='/fake/ref', created_at=datetime.datetime(2023, 4, 7, 18, 59, 3, 312234), sha256sum='9c298d589a2158eb513cb52191144518a2acab2cb0c04f1df14fca0f712fa4a1')]
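Since available_models is just a list of ModelInfo objects, retrieving your entry afterwards is plain Python, for example:
# Look up the entry we just registered
[m for m in store.available_models if m.name == "my_foundation_model"]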
Share with the community¶
We invite you to share your featurizers with the community to help progress the field. To learn more, visit the developer documentation.