Graphormer

Graphormer pretrained models

GraphormerTransformer

Bases: PretrainedMolTransformer

Graphormer transformer from Microsoft, pretrained on the PCQM4Mv2 quantum chemistry dataset for the prediction of the HOMO-LUMO gap.

Attributes:

* featurizer: Graphormer embedding object
* dtype: Data type of the output. Use the call method instead of setting it directly
* pooling: Pooling method for Graphormer's embedding layer
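
A quick usage sketch (not part of the source): it assumes molfeat is installed together with the graphormer-pretrained extra, that the checkpoint can be fetched from the model store, and that the import path is molfeat.trans.pretrained. The SMILES are illustrative.

from molfeat.trans.pretrained import GraphormerTransformer

smiles = ["CCO", "c1ccccc1", "CC(=O)Nc1ccc(O)cc1"]

# default behaviour: mean-pool the last hidden state across all nodes
transformer = GraphormerTransformer(kind="pcqm4mv2_graphormer_base", pooling="mean")
features = transformer(smiles)  # molfeat transformers are callable
print(features.shape)  # (3, embedding_dim)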

Source code in molfeat/trans/pretrained/graphormer.py
class GraphormerTransformer(PretrainedMolTransformer):
    r"""
    Graphormer transformer from Microsoft, pretrained on the PCQM4Mv2 quantum chemistry dataset
    for the prediction of the HOMO-LUMO gap.

    Attributes:
        featurizer: Graphormer embedding object
        dtype: Data type. Use call instead
        pooling: Pooling method for Graphormer's embedding layer
    """

    def __init__(
        self,
        kind: str = "pcqm4mv2_graphormer_base",
        dtype: Callable = np.float32,
        pooling: str = "mean",
        max_length: Optional[int] = None,
        concat_layers: Union[List[int], int] = -1,
        ignore_padding: bool = True,
        version=None,
        **params,
    ):
        """
        Pretrained graphormer featurizer.

        !!! note
            The default behaviour of this feature extractor is to return the last hidden state of the encoder,
            averaged across all nodes (including the virtual node connected to all other nodes).

            For a different behaviour, please change the pooling method:
            * `graph` or `virtual`: use the virtual node embedding in the last layer to get the graph representation
            * `mean`, `max`, `sum`, etc or any other supported pooling of `molfeat.utils.pooler.Pooling`
                will take the operation defined by the pooling layer across all nodes of each graph

        Args:
            kind: name of the featurizer as available in the model store
            dtype: Data type to output
            pooling: type of pooling to use. One of ['graph', 'virtual', 'mean', 'max', 'sum']. The value "graph" corresponds to the virtual node representation
            max_length: Maximum length of the input sequence to consider. Please update this for large sequences
            concat_layers: Layer to concat to get the representation. By default the last hidden layer is returned.
            ignore_padding: Whether to ignore padding in the representation (default: True) to avoid effect of batching
            params: any other parameter to pass to PretrainedMolTransformer
        """

        super().__init__(dtype=dtype, pooling=pooling, **params)
        if not requires.check("graphormer_pretrained"):
            raise ValueError("`graphormer` is required to use this featurizer.")

        if concat_layers is None:
            concat_layers = -1
        if not isinstance(concat_layers, list):
            concat_layers = [concat_layers]
        self.concat_layers = concat_layers
        self.preload = True
        self.name = kind
        self._require_mols = False
        self.max_length = max_length
        self.ignore_padding = ignore_padding
        if isinstance(pooling, str):
            if pooling in Pooling.SUPPORTED_POOLING:
                pooling = Pooling(dim=1, name=pooling)
            else:
                pooling = None
        self.pooling = pooling
        self.featurizer = GraphormerEmbeddingsExtractor(
            pretrained_name=self.name, max_nodes=self.max_length, concat_layers=self.concat_layers
        )
        self.featurizer.config.max_nodes = self.max_length
        self.version = version

    def __repr__(self):
        return "{}(name={}, pooling={}, dtype={})".format(
            self.__class__.__name__,
            _parse_to_evaluable_str(self.name),
            _parse_to_evaluable_str(self.pooling.name),
            _parse_to_evaluable_str(self.dtype),
        )

    @staticmethod
    def list_available_models():
        """List available graphormer model to use"""
        return [
            "pcqm4mv1_graphormer_base",  # PCQM4Mv1
            "pcqm4mv2_graphormer_base",  # PCQM4Mv2
            "pcqm4mv1_graphormer_base_for_molhiv",  # ogbg-molhiv
            "oc20is2re_graphormer3d_base",  # Open Catalyst Challenge
        ]

    def _embed(self, inputs: List[str], **kwargs):
        """Internal molecular embedding

        Args:
            inputs: input smiles
        """
        with torch.no_grad():
            layer_reprs, graph_reprs, padding_mask = self.featurizer(inputs)
            if self.pooling is None:
                x = graph_reprs
            else:
                x = self.pooling(layer_reprs, mask=(padding_mask if self.ignore_padding else None))
        return x.numpy()

    def __getstate__(self):
        """Getting state to allow pickling"""
        d = copy.deepcopy(self.__dict__)
        d["precompute_cache"] = None
        d.pop("featurizer", None)
        return d

    def __setstate__(self, d):
        """Setting state during reloading pickling"""
        self.__dict__.update(d)
        self._update_params()

    def compute_max_length(self, inputs: list):
        """Compute maximum node number for the input list of molecules

        Args:
            inputs: input list of molecules
        """
        dataset = GraphormerInferenceDataset(
            inputs,
            multi_hop_max_dist=self.featurizer.config.multi_hop_max_dist,
            spatial_pos_max=self.featurizer.config.spatial_pos_max,
        )
        xs = [item.x.size(0) for item in dataset]
        return max(xs)

    def set_max_length(self, max_length: int):
        """Set the maximum length for this featurizer"""
        self.max_length = max_length
        self._update_params()
        self._preload()

    def _convert(self, inputs: list, **kwargs):
        """Convert molecules to the right format

        Args:
            inputs: inputs to preprocess

        Returns:
            processed: pre-processed input list
        """
        inputs = super()._convert(inputs, **kwargs)
        batch = self.featurizer._convert(inputs)
        return batch

    def _update_params(self):
        super()._update_params()
        self.featurizer = GraphormerEmbeddingsExtractor(
            pretrained_name=self.name, max_nodes=self.max_length
        )
        self.featurizer.config.max_nodes = self.max_length

__getstate__()

Getting state to allow pickling

Source code in molfeat/trans/pretrained/graphormer.py
def __getstate__(self):
    """Getting state to allow pickling"""
    d = copy.deepcopy(self.__dict__)
    d["precompute_cache"] = None
    d.pop("featurizer", None)
    return d

__init__(kind='pcqm4mv2_graphormer_base', dtype=np.float32, pooling='mean', max_length=None, concat_layers=-1, ignore_padding=True, version=None, **params)

Pretrained graphormer featurizer.

Note

The default behaviour of this feature extractor is to return the last hidden state of the encoder, averaged across all nodes (including the virtual node connected to all other nodes).

For a different behaviour, change the pooling method:

* `graph` or `virtual`: use the virtual node embedding in the last layer to get the graph representation
* `mean`, `max`, `sum`, or any other pooling supported by `molfeat.utils.pooler.Pooling`: apply the operation defined by the pooling layer across all nodes of each graph

Both options are sketched in the example after the parameter list below.

Parameters:

* kind (str, default: 'pcqm4mv2_graphormer_base'): name of the featurizer as available in the model store
* dtype (Callable, default: np.float32): data type to output
* pooling (str, default: 'mean'): type of pooling to use. One of ['graph', 'virtual', 'mean', 'max', 'sum']. The value "graph" corresponds to the virtual node representation
* max_length (Optional[int], default: None): maximum length of the input sequence to consider. Update this for large sequences
* concat_layers (Union[List[int], int], default: -1): layer(s) to concatenate to get the representation. By default the last hidden layer is returned
* ignore_padding (bool, default: True): whether to ignore padding in the representation, to avoid batching effects
* params (default: {}): any other parameters to pass to PretrainedMolTransformer
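
The sketch promised in the note above, contrasting the two pooling families (assuming the default pcqm4mv2_graphormer_base checkpoint is available locally):

from molfeat.trans.pretrained import GraphormerTransformer

smiles = ["CCO", "c1ccccc1"]

# graph-level representation read off the virtual node of the last layer
virtual_repr = GraphormerTransformer(pooling="graph")(smiles)

# max-pool the node embeddings of each graph instead of averaging them
max_repr = GraphormerTransformer(pooling="max")(smiles)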
Source code in molfeat/trans/pretrained/graphormer.py
def __init__(
    self,
    kind: str = "pcqm4mv2_graphormer_base",
    dtype: Callable = np.float32,
    pooling: str = "mean",
    max_length: Optional[int] = None,
    concat_layers: Union[List[int], int] = -1,
    ignore_padding: bool = True,
    version=None,
    **params,
):
    """
    Pretrained graphormer featurizer.

    !!! note
        The default behaviour of this feature extractor is to return the last hidden state of the encoder,
        averaged across all nodes (including the virtual node connected to all other nodes).

        For a different behaviour, please change the pooling method:
        * `graph` or `virtual`: use the virtual node embedding in the last layer to get the graph representation
        * `mean`, `max`, `sum`, etc or any other supported pooling of `molfeat.utils.pooler.Pooling`
            will take the operation defined by the pooling layer across all nodes of each graph

    Args:
        kind: name of the featurizer as available in the model store
        dtype: Data type to output
        pooling: type of pooling to use. One of ['graph', 'virtual', 'mean', 'max', 'sum']. The value "graph" corresponds to the virtual node representation
        max_length: Maximum length of the input sequence to consider. Please update this for large sequences
        concat_layers: Layer to concat to get the representation. By default the last hidden layer is returned.
        ignore_padding: Whether to ignore padding in the representation (default: True) to avoid effect of batching
        params: any other parameter to pass to PretrainedMolTransformer
    """

    super().__init__(dtype=dtype, pooling=pooling, **params)
    if not requires.check("graphormer_pretrained"):
        raise ValueError("`graphormer` is required to use this featurizer.")

    if concat_layers is None:
        concat_layers = -1
    if not isinstance(concat_layers, list):
        concat_layers = [concat_layers]
    self.concat_layers = concat_layers
    self.preload = True
    self.name = kind
    self._require_mols = False
    self.max_length = max_length
    self.ignore_padding = ignore_padding
    if isinstance(pooling, str):
        if pooling in Pooling.SUPPORTED_POOLING:
            pooling = Pooling(dim=1, name=pooling)
        else:
            pooling = None
    self.pooling = pooling
    self.featurizer = GraphormerEmbeddingsExtractor(
        pretrained_name=self.name, max_nodes=self.max_length, concat_layers=self.concat_layers
    )
    self.featurizer.config.max_nodes = self.max_length
    self.version = version

__setstate__(d)

Set the state when restoring from a pickle

Source code in molfeat/trans/pretrained/graphormer.py
def __setstate__(self, d):
    """Setting state during reloading pickling"""
    self.__dict__.update(d)
    self._update_params()
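
Together, __getstate__ and __setstate__ make the transformer picklable: the embeddings extractor is dropped on save and rebuilt via _update_params() on load. A minimal round-trip sketch:

import pickle

from molfeat.trans.pretrained import GraphormerTransformer

transformer = GraphormerTransformer()
payload = pickle.dumps(transformer)  # __getstate__ drops the featurizer attribute
restored = pickle.loads(payload)     # __setstate__ rebuilds it from the saved config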

compute_max_length(inputs)

Compute maximum node number for the input list of molecules

Parameters:

* inputs (list, required): input list of molecules
Source code in molfeat/trans/pretrained/graphormer.py
def compute_max_length(self, inputs: list):
    """Compute maximum node number for the input list of molecules

    Args:
        inputs: input list of molecules
    """
    dataset = GraphormerInferenceDataset(
        inputs,
        multi_hop_max_dist=self.featurizer.config.multi_hop_max_dist,
        spatial_pos_max=self.featurizer.config.spatial_pos_max,
    )
    xs = [item.x.size(0) for item in dataset]
    return max(xs)
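
This is handy for sizing the padding to a dataset before featurizing it. A sketch, assuming SMILES inputs are accepted here as they are elsewhere in the class:

from molfeat.trans.pretrained import GraphormerTransformer

transformer = GraphormerTransformer()
smiles = ["CCO", "CC(=O)Nc1ccc(O)cc1"]
longest = transformer.compute_max_length(smiles)  # node count of the largest molecule
transformer.set_max_length(longest)               # see set_max_length below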

list_available_models() staticmethod

List the available Graphormer models

Source code in molfeat/trans/pretrained/graphormer.py
@staticmethod
def list_available_models():
    """List available graphormer model to use"""
    return [
        "pcqm4mv1_graphormer_base",  # PCQM4Mv1
        "pcqm4mv2_graphormer_base",  # PCQM4Mv2
        "pcqm4mv1_graphormer_base_for_molhiv",  # ogbg-molhiv
        "oc20is2re_graphormer3d_base",  # Open Catalyst Challenge
    ]
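
For example, to inspect the choices and pick a checkpoint at construction time (a sketch):

from molfeat.trans.pretrained import GraphormerTransformer

print(GraphormerTransformer.list_available_models())
transformer = GraphormerTransformer(kind="pcqm4mv1_graphormer_base_for_molhiv")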

set_max_length(max_length)

Set the maximum length for this featurizer

Source code in molfeat/trans/pretrained/graphormer.py
def set_max_length(self, max_length: int):
    """Set the maximum length for this featurizer"""
    self.max_length = max_length
    self._update_params()
    self._preload()
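
Because this rebuilds the underlying extractor, it can be called on an existing transformer, for example to raise the limit before featurizing unusually large molecules (a sketch; 128 is an arbitrary value):

from molfeat.trans.pretrained import GraphormerTransformer

transformer = GraphormerTransformer(max_length=None)
transformer.set_max_length(128)  # re-instantiates the extractor with max_nodes=128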