bindingcalculator

Calculate residual antibody binding after some mutations.

This module can be downloaded from https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/bindingcalculator.py

The module defines BindingCalculator which does the calculation.

Written by Jesse Bloom.

View Source

  1"""Calculate residual antibody binding after some mutations.
  2
  3This module can be downloaded from
  4`https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/bindingcalculator.py <https://github.com/jbloomlab/SARS2_RBD_Ab_escape_maps/blob/main/bindingcalculator.py>`_
  5
  6The module defines :class:`BindingCalculator` which does the calculation.
  7
  8Written by Jesse Bloom.
  9
 10"""
 11
 12__docformat__ = 'numpy'
 13
 14
 15import pandas as pd
 16
 17
 18class BindingCalculator:
 19    """Calculates residual polyclonal antibody binding after some mutations.
 20
 21    The calculator is the one implemented interactively at
 22    `https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/ <https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/>`_
 23
 24    Parameters
 25    ----------
 26    csv_or_url : str
 27        Path to CSV or URL of CSV containing the escape data. Should
 28        have columns 'condition', 'metric', and 'escape'.
 29    eliciting_virus : str
 30        Include antibodies elicited by these viruses.
 31    known_to_neutralize : str
 32        Include antibodies known to neutralize this virus.
 33    weight_by_log_IC50 : bool
 34        Weight antibodies by log IC50.
 35    mutation_escape_strength : float
 36        Scaling exponent :math:`s`; larger values mean stronger escape, see
 37        https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/
 38
 39    Attributes
 40    ----------
 41    escape_data : pandas.DataFrame
 42        The data frame read from `csv_or_url` after filtering for specified
 43        `eliciting_virus` and `known_to_neutralize`.
 44    sites : set
 45        All sites for which we have escape data. We can only calculate effects
 46        of mutations at these sites.
 47    weight_by_log_IC50 : bool
 48        Value of `weight_by_log_IC50` passed as parameter.
 49
 50    Example
 51    -------
 52    Create escape calculator. You will generally just want to use the latest
 53    escape calculator data on GitHub, which it the default for `csv_to_url`.
 54    But here we use a local file for testing:
 55
 56    >>> bindcalc = BindingCalculator(csv_or_url='processed_data/escape_calculator_data.csv')
 57
 58    We can look at the escape data after geting the specified normalization
 59    and metric and then scaling each escape value relative to max for condition:
 60
 61    >>> bindcalc.escape_data.head()
 62      condition  site   escape neg_log_IC50  max_escape  scale_escape
 63    0      1-57   338  0.05792       7.3385      0.9521      0.060834
 64    1      1-57   359  0.01558       7.3385      0.9521      0.016364
 65    2      1-57   370  0.03169       7.3385      0.9521      0.033284
 66    3      1-57   394  0.01253       7.3385      0.9521      0.013160
 67    4      1-57   396  0.02160       7.3385      0.9521      0.022687
 68
 69    We can also check what sites have escape data. Here we just
 70    show min and max sites with data:
 71
 72    >>> min(bindcalc.sites)
 73    331
 74    >>> max(bindcalc.sites)
 75    531
 76
 77    Now calculate the fraction of all polyclonal antibody binding retained
 78    after some sites have been mutated. If no sites have been mutated, all
 79    binding is retained:
 80
 81    >>> bindcalc.binding_retained([])
 82    1.0
 83
 84    With mutation at site 484:
 85
 86    >>> round(bindcalc.binding_retained([484]), 3)
 87    0.817
 88
 89    With mutation at site 417 and 484:
 90
 91    >>> round(bindcalc.binding_retained([417, 484]), 3)
 92    0.733
 93
 94    If you have a data frame of variants, you can just map the
 95    calculation of the binding retained to a new column, like this:
 96
 97    >>> variants = pd.DataFrame([('Wuhan-Hu-1', []),
 98    ...                          ('B.1.351', [417, 484, 501]),
 99    ...                          ('B.1.1.7', [501]),
100    ...                          ('B.1.429', [452])],
101    ...                         columns=['variant', 'mutated RBD sites'])
102    >>> variants['binding_retained'] = (variants['mutated RBD sites']
103    ...                                 .map(bindcalc.binding_retained))
104    >>> variants.round(3)
105          variant mutated RBD sites  binding_retained
106    0  Wuhan-Hu-1                []             1.000
107    1     B.1.351   [417, 484, 501]             0.689
108    2     B.1.1.7             [501]             0.940
109    3     B.1.429             [452]             0.903
110
111    We can also calculate the escape remaining at each site after a mutation:
112
113    >>> bindcalc.escape_per_site([417, 484]).query('site in [484, 486, 490]')
114         site original_escape retained_escape
115    135   484        0.070931         0.00925
116    137   486        0.116009        0.103263
117    141   490        0.055852        0.022456
118
119    Now do the same but **not** weighting by log IC50:
120
121    >>> bindcalc_noweight = BindingCalculator(
122    ...     csv_or_url='processed_data/escape_calculator_data.csv',
123    ...     weight_by_log_IC50=False,
124    ... )
125
126    >>> bindcalc_noweight.binding_retained([])
127    1.0
128
129    >>> round(bindcalc_noweight.binding_retained([484]), 3)
130    0.836
131
132    >>> round(bindcalc_noweight.binding_retained([417, 484]), 3)
133    0.764
134
135    >>> bindcalc_noweight.escape_per_site([417, 484]).query('site in [484, 486, 490]').round(3)
136         site  original_escape  retained_escape
137    135   484            0.066            0.009
138    137   486            0.080            0.067
139    141   490            0.053            0.020
140
141    """
142    def __init__(self,
143        csv_or_url='https://raw.githubusercontent.com/jbloomlab/SARS2_RBD_Ab_escape_maps/main/processed_data/escape_calculator_data.csv',
144        *,
145        eliciting_virus='SARS-CoV-2',
146        known_to_neutralize="any",
147        weight_by_log_IC50=True,
148        mutation_escape_strength=2,
149    ):
150        """See main class docstring."""
151        # read escape data 
152        self.escape_data = (
153            pd.read_csv(csv_or_url)
154            .assign(
155                eliciting_virus=lambda x: x["eliciting_virus"].str.split(";"),
156                known_to_neutralize=lambda x: x["known_to_neutralize"].str.split(";"),
157                neg_log_IC50=lambda x: x["neg_log_IC50"].map(
158                    lambda s: tuple([pd.NA if si == "NA" else float(si) for si in s.split(";")])
159                )
160            )
161            .explode("eliciting_virus")
162            .explode(["known_to_neutralize", "neg_log_IC50"])
163        )
164        assert all(self.escape_data["neg_log_IC50"] >= 0)
165
166        # make sure escape data has expected columns
167        if not set(self.escape_data.columns).issuperset({'condition',
168                                                         'site',
169                                                         'escape',
170                                                         'eliciting_virus',
171                                                         "known_to_neutralize",
172                                                         "neg_log_IC50",
173                                                         }):
174            raise ValueError(f"{self.escape_data.columns=} lacks expected columns")
175
176        # filter by virus
177        eliciting_viruses = set(self.escape_data["eliciting_virus"])
178        if eliciting_virus not in eliciting_viruses:
179            raise ValueError(f"{eliciting_virus=} not in {eliciting_viruses=}")
180        self.escape_data = self.escape_data.query('eliciting_virus == @eliciting_virus').drop(
181            columns="eliciting_virus"
182        )
183        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
184
185        # filter by known_to_neutralize
186        if known_to_neutralize not in set(self.escape_data['known_to_neutralize']):
187            raise ValueError(f"invalid {known_to_neutralize=}")
188        self.escape_data = (
189            self.escape_data
190            .query("known_to_neutralize == @known_to_neutralize")
191            .drop(columns="known_to_neutralize")
192        )
193        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
194
195        # get escape scaled by the max escape for that condition
196        self.escape_data = (
197                self.escape_data
198                .assign(max_escape=lambda x: (x.groupby('condition')
199                                              ['escape']
200                                              .transform('max')
201                                              ),
202                        scale_escape=lambda x: x['escape'] / x['max_escape'],
203                        )
204                )
205
206        # get all sites for which we have escape data
207        self.sites = set(self.escape_data['site'])
208
209        # set mutation escape strength
210        self.mutation_escape_strength = mutation_escape_strength
211
212        # do we weight by log IC50?
213        self.weight_by_log_IC50 = weight_by_log_IC50
214
215        assert (
216            self.escape_data["condition"].nunique()
217            == len(self.escape_data[["condition", "neg_log_IC50"]].drop_duplicates())
218        )
219
220        # number of conditions (antibodies), weighting by negative log IC50 if doing that
221        if self.weight_by_log_IC50:
222            self._n_conditions = (
223                self.escape_data
224                [["condition", "neg_log_IC50"]]
225                .drop_duplicates()
226                ["neg_log_IC50"]
227                .sum()
228            )
229        else:
230            self._n_conditions = self.escape_data['condition'].nunique()
231
232    def escape_per_site(self, mutated_sites):
233        """Escape at each site after mutating indicated sites.
234
235        Parameters
236        ----------
237        mutated_sites : array-like of integers
238            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
239
240        Returns
241        -------
242        pandas.DataFrame
243            For each site, gives the original escape and the escape
244            retained after mutations.
245
246        """
247        mutated_sites = set(mutated_sites)
248        if not mutated_sites.issubset(self.sites):
249            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
250        df = (
251            self.escape_data
252            .assign(
253                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
254                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated']
255                )
256            .groupby(['condition', "neg_log_IC50"], as_index=False)
257            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
258            .assign(
259                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
260                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
261            )
262            [["condition", 'cond_bind_retain', "weight"]]
263            .merge(self.escape_data[['condition', 'site', 'escape']])
264            .assign(
265                escape=lambda x: x["escape"] * x["weight"],
266                retained_escape=lambda x: x['cond_bind_retain'] * x['escape'],
267            )
268            .groupby('site')
269            .aggregate(
270                original_escape=pd.NamedAgg('escape', 'sum'),
271                retained_escape=pd.NamedAgg('retained_escape', 'sum'),
272            )
273        ) / self._n_conditions
274        return df.reset_index()
275
276    def binding_retained(self, mutated_sites):
277        """Fraction binding retained after mutating indicated sites.
278
279        Parameters
280        ----------
281        mutated_sites : array-like of integers
282            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
283
284        Returns
285        -------
286        float
287            The fraction binding retained after these mutations.
288
289        """
290        mutated_sites = set(mutated_sites)
291        if not mutated_sites.issubset(self.sites):
292            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
293        binding_retained = (
294            self.escape_data
295            .assign(
296                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
297                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated'],
298            )
299            .groupby(['condition', 'neg_log_IC50'], as_index=False)
300            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
301            .assign(
302                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
303                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
304                weighted_cond_bind_retain=lambda x: x["cond_bind_retain"] * x["weight"],
305            )
306            ['weighted_cond_bind_retain']
307            .sum()
308        ) / self._n_conditions
309        return binding_retained
310
311
312if __name__ == '__main__':
313    import doctest
314    doctest.testmod()

class BindingCalculator: View Source

 19class BindingCalculator:
 20    """Calculates residual polyclonal antibody binding after some mutations.
 21
 22    The calculator is the one implemented interactively at
 23    `https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/ <https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/>`_
 24
 25    Parameters
 26    ----------
 27    csv_or_url : str
 28        Path to CSV or URL of CSV containing the escape data. Should
 29        have columns 'condition', 'metric', and 'escape'.
 30    eliciting_virus : str
 31        Include antibodies elicited by these viruses.
 32    known_to_neutralize : str
 33        Include antibodies known to neutralize this virus.
 34    weight_by_log_IC50 : bool
 35        Weight antibodies by log IC50.
 36    mutation_escape_strength : float
 37        Scaling exponent :math:`s`; larger values mean stronger escape, see
 38        https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/
 39
 40    Attributes
 41    ----------
 42    escape_data : pandas.DataFrame
 43        The data frame read from `csv_or_url` after filtering for specified
 44        `eliciting_virus` and `known_to_neutralize`.
 45    sites : set
 46        All sites for which we have escape data. We can only calculate effects
 47        of mutations at these sites.
 48    weight_by_log_IC50 : bool
 49        Value of `weight_by_log_IC50` passed as parameter.
 50
 51    Example
 52    -------
 53    Create escape calculator. You will generally just want to use the latest
 54    escape calculator data on GitHub, which it the default for `csv_to_url`.
 55    But here we use a local file for testing:
 56
 57    >>> bindcalc = BindingCalculator(csv_or_url='processed_data/escape_calculator_data.csv')
 58
 59    We can look at the escape data after geting the specified normalization
 60    and metric and then scaling each escape value relative to max for condition:
 61
 62    >>> bindcalc.escape_data.head()
 63      condition  site   escape neg_log_IC50  max_escape  scale_escape
 64    0      1-57   338  0.05792       7.3385      0.9521      0.060834
 65    1      1-57   359  0.01558       7.3385      0.9521      0.016364
 66    2      1-57   370  0.03169       7.3385      0.9521      0.033284
 67    3      1-57   394  0.01253       7.3385      0.9521      0.013160
 68    4      1-57   396  0.02160       7.3385      0.9521      0.022687
 69
 70    We can also check what sites have escape data. Here we just
 71    show min and max sites with data:
 72
 73    >>> min(bindcalc.sites)
 74    331
 75    >>> max(bindcalc.sites)
 76    531
 77
 78    Now calculate the fraction of all polyclonal antibody binding retained
 79    after some sites have been mutated. If no sites have been mutated, all
 80    binding is retained:
 81
 82    >>> bindcalc.binding_retained([])
 83    1.0
 84
 85    With mutation at site 484:
 86
 87    >>> round(bindcalc.binding_retained([484]), 3)
 88    0.817
 89
 90    With mutation at site 417 and 484:
 91
 92    >>> round(bindcalc.binding_retained([417, 484]), 3)
 93    0.733
 94
 95    If you have a data frame of variants, you can just map the
 96    calculation of the binding retained to a new column, like this:
 97
 98    >>> variants = pd.DataFrame([('Wuhan-Hu-1', []),
 99    ...                          ('B.1.351', [417, 484, 501]),
100    ...                          ('B.1.1.7', [501]),
101    ...                          ('B.1.429', [452])],
102    ...                         columns=['variant', 'mutated RBD sites'])
103    >>> variants['binding_retained'] = (variants['mutated RBD sites']
104    ...                                 .map(bindcalc.binding_retained))
105    >>> variants.round(3)
106          variant mutated RBD sites  binding_retained
107    0  Wuhan-Hu-1                []             1.000
108    1     B.1.351   [417, 484, 501]             0.689
109    2     B.1.1.7             [501]             0.940
110    3     B.1.429             [452]             0.903
111
112    We can also calculate the escape remaining at each site after a mutation:
113
114    >>> bindcalc.escape_per_site([417, 484]).query('site in [484, 486, 490]')
115         site original_escape retained_escape
116    135   484        0.070931         0.00925
117    137   486        0.116009        0.103263
118    141   490        0.055852        0.022456
119
120    Now do the same but **not** weighting by log IC50:
121
122    >>> bindcalc_noweight = BindingCalculator(
123    ...     csv_or_url='processed_data/escape_calculator_data.csv',
124    ...     weight_by_log_IC50=False,
125    ... )
126
127    >>> bindcalc_noweight.binding_retained([])
128    1.0
129
130    >>> round(bindcalc_noweight.binding_retained([484]), 3)
131    0.836
132
133    >>> round(bindcalc_noweight.binding_retained([417, 484]), 3)
134    0.764
135
136    >>> bindcalc_noweight.escape_per_site([417, 484]).query('site in [484, 486, 490]').round(3)
137         site  original_escape  retained_escape
138    135   484            0.066            0.009
139    137   486            0.080            0.067
140    141   490            0.053            0.020
141
142    """
143    def __init__(self,
144        csv_or_url='https://raw.githubusercontent.com/jbloomlab/SARS2_RBD_Ab_escape_maps/main/processed_data/escape_calculator_data.csv',
145        *,
146        eliciting_virus='SARS-CoV-2',
147        known_to_neutralize="any",
148        weight_by_log_IC50=True,
149        mutation_escape_strength=2,
150    ):
151        """See main class docstring."""
152        # read escape data 
153        self.escape_data = (
154            pd.read_csv(csv_or_url)
155            .assign(
156                eliciting_virus=lambda x: x["eliciting_virus"].str.split(";"),
157                known_to_neutralize=lambda x: x["known_to_neutralize"].str.split(";"),
158                neg_log_IC50=lambda x: x["neg_log_IC50"].map(
159                    lambda s: tuple([pd.NA if si == "NA" else float(si) for si in s.split(";")])
160                )
161            )
162            .explode("eliciting_virus")
163            .explode(["known_to_neutralize", "neg_log_IC50"])
164        )
165        assert all(self.escape_data["neg_log_IC50"] >= 0)
166
167        # make sure escape data has expected columns
168        if not set(self.escape_data.columns).issuperset({'condition',
169                                                         'site',
170                                                         'escape',
171                                                         'eliciting_virus',
172                                                         "known_to_neutralize",
173                                                         "neg_log_IC50",
174                                                         }):
175            raise ValueError(f"{self.escape_data.columns=} lacks expected columns")
176
177        # filter by virus
178        eliciting_viruses = set(self.escape_data["eliciting_virus"])
179        if eliciting_virus not in eliciting_viruses:
180            raise ValueError(f"{eliciting_virus=} not in {eliciting_viruses=}")
181        self.escape_data = self.escape_data.query('eliciting_virus == @eliciting_virus').drop(
182            columns="eliciting_virus"
183        )
184        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
185
186        # filter by known_to_neutralize
187        if known_to_neutralize not in set(self.escape_data['known_to_neutralize']):
188            raise ValueError(f"invalid {known_to_neutralize=}")
189        self.escape_data = (
190            self.escape_data
191            .query("known_to_neutralize == @known_to_neutralize")
192            .drop(columns="known_to_neutralize")
193        )
194        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
195
196        # get escape scaled by the max escape for that condition
197        self.escape_data = (
198                self.escape_data
199                .assign(max_escape=lambda x: (x.groupby('condition')
200                                              ['escape']
201                                              .transform('max')
202                                              ),
203                        scale_escape=lambda x: x['escape'] / x['max_escape'],
204                        )
205                )
206
207        # get all sites for which we have escape data
208        self.sites = set(self.escape_data['site'])
209
210        # set mutation escape strength
211        self.mutation_escape_strength = mutation_escape_strength
212
213        # do we weight by log IC50?
214        self.weight_by_log_IC50 = weight_by_log_IC50
215
216        assert (
217            self.escape_data["condition"].nunique()
218            == len(self.escape_data[["condition", "neg_log_IC50"]].drop_duplicates())
219        )
220
221        # number of conditions (antibodies), weighting by negative log IC50 if doing that
222        if self.weight_by_log_IC50:
223            self._n_conditions = (
224                self.escape_data
225                [["condition", "neg_log_IC50"]]
226                .drop_duplicates()
227                ["neg_log_IC50"]
228                .sum()
229            )
230        else:
231            self._n_conditions = self.escape_data['condition'].nunique()
232
233    def escape_per_site(self, mutated_sites):
234        """Escape at each site after mutating indicated sites.
235
236        Parameters
237        ----------
238        mutated_sites : array-like of integers
239            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
240
241        Returns
242        -------
243        pandas.DataFrame
244            For each site, gives the original escape and the escape
245            retained after mutations.
246
247        """
248        mutated_sites = set(mutated_sites)
249        if not mutated_sites.issubset(self.sites):
250            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
251        df = (
252            self.escape_data
253            .assign(
254                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
255                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated']
256                )
257            .groupby(['condition', "neg_log_IC50"], as_index=False)
258            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
259            .assign(
260                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
261                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
262            )
263            [["condition", 'cond_bind_retain', "weight"]]
264            .merge(self.escape_data[['condition', 'site', 'escape']])
265            .assign(
266                escape=lambda x: x["escape"] * x["weight"],
267                retained_escape=lambda x: x['cond_bind_retain'] * x['escape'],
268            )
269            .groupby('site')
270            .aggregate(
271                original_escape=pd.NamedAgg('escape', 'sum'),
272                retained_escape=pd.NamedAgg('retained_escape', 'sum'),
273            )
274        ) / self._n_conditions
275        return df.reset_index()
276
277    def binding_retained(self, mutated_sites):
278        """Fraction binding retained after mutating indicated sites.
279
280        Parameters
281        ----------
282        mutated_sites : array-like of integers
283            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
284
285        Returns
286        -------
287        float
288            The fraction binding retained after these mutations.
289
290        """
291        mutated_sites = set(mutated_sites)
292        if not mutated_sites.issubset(self.sites):
293            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
294        binding_retained = (
295            self.escape_data
296            .assign(
297                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
298                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated'],
299            )
300            .groupby(['condition', 'neg_log_IC50'], as_index=False)
301            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
302            .assign(
303                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
304                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
305                weighted_cond_bind_retain=lambda x: x["cond_bind_retain"] * x["weight"],
306            )
307            ['weighted_cond_bind_retain']
308            .sum()
309        ) / self._n_conditions
310        return binding_retained

Calculates residual polyclonal antibody binding after some mutations.

The calculator is the one implemented interactively at https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/

Parameters

csv_or_url (str): Path to CSV or URL of CSV containing the escape data. Should have columns 'condition', 'metric', and 'escape'.
eliciting_virus (str): Include antibodies elicited by these viruses.
known_to_neutralize (str): Include antibodies known to neutralize this virus.
weight_by_log_IC50 (bool): Weight antibodies by log IC50.
mutation_escape_strength (float): Scaling exponent \( s \); larger values mean stronger escape, see https://jbloomlab.github.io/SARS2_RBD_Ab_escape_maps/escape-calc/

Attributes

escape_data (pandas.DataFrame): The data frame read from csv_or_url after filtering for specified eliciting_virus and known_to_neutralize.
sites (set): All sites for which we have escape data. We can only calculate effects of mutations at these sites.
weight_by_log_IC50 (bool): Value of weight_by_log_IC50 passed as parameter.

Example

Create escape calculator. You will generally just want to use the latest escape calculator data on GitHub, which it the default for csv_to_url. But here we use a local file for testing:

>>> bindcalc = BindingCalculator(csv_or_url='processed_data/escape_calculator_data.csv')

We can look at the escape data after geting the specified normalization and metric and then scaling each escape value relative to max for condition:

>>> bindcalc.escape_data.head()
  condition  site   escape neg_log_IC50  max_escape  scale_escape
0      1-57   338  0.05792       7.3385      0.9521      0.060834
1      1-57   359  0.01558       7.3385      0.9521      0.016364
2      1-57   370  0.03169       7.3385      0.9521      0.033284
3      1-57   394  0.01253       7.3385      0.9521      0.013160
4      1-57   396  0.02160       7.3385      0.9521      0.022687

We can also check what sites have escape data. Here we just show min and max sites with data:

>>> min(bindcalc.sites)
331
>>> max(bindcalc.sites)
531

Now calculate the fraction of all polyclonal antibody binding retained after some sites have been mutated. If no sites have been mutated, all binding is retained:

>>> bindcalc.binding_retained([])
1.0

With mutation at site 484:

>>> round(bindcalc.binding_retained([484]), 3)
0.817

With mutation at site 417 and 484:

>>> round(bindcalc.binding_retained([417, 484]), 3)
0.733

If you have a data frame of variants, you can just map the calculation of the binding retained to a new column, like this:

>>> variants = pd.DataFrame([('Wuhan-Hu-1', []),
...                          ('B.1.351', [417, 484, 501]),
...                          ('B.1.1.7', [501]),
...                          ('B.1.429', [452])],
...                         columns=['variant', 'mutated RBD sites'])
>>> variants['binding_retained'] = (variants['mutated RBD sites']
...                                 .map(bindcalc.binding_retained))
>>> variants.round(3)
      variant mutated RBD sites  binding_retained
0  Wuhan-Hu-1                []             1.000
1     B.1.351   [417, 484, 501]             0.689
2     B.1.1.7             [501]             0.940
3     B.1.429             [452]             0.903

We can also calculate the escape remaining at each site after a mutation:

>>> bindcalc.escape_per_site([417, 484]).query('site in [484, 486, 490]')
     site original_escape retained_escape
135   484        0.070931         0.00925
137   486        0.116009        0.103263
141   490        0.055852        0.022456

Now do the same but not weighting by log IC50:

>>> bindcalc_noweight = BindingCalculator(
...     csv_or_url='processed_data/escape_calculator_data.csv',
...     weight_by_log_IC50=False,
... )

>>> bindcalc_noweight.binding_retained([])
1.0

>>> round(bindcalc_noweight.binding_retained([484]), 3)
0.836

>>> round(bindcalc_noweight.binding_retained([417, 484]), 3)
0.764

>>> bindcalc_noweight.escape_per_site([417, 484]).query('site in [484, 486, 490]').round(3)
     site  original_escape  retained_escape
135   484            0.066            0.009
137   486            0.080            0.067
141   490            0.053            0.020

BindingCalculator( csv_or_url='https://raw.githubusercontent.com/jbloomlab/SARS2_RBD_Ab_escape_maps/main/processed_data/escape_calculator_data.csv', *, eliciting_virus='SARS-CoV-2', known_to_neutralize='any', weight_by_log_IC50=True, mutation_escape_strength=2) View Source

143    def __init__(self,
144        csv_or_url='https://raw.githubusercontent.com/jbloomlab/SARS2_RBD_Ab_escape_maps/main/processed_data/escape_calculator_data.csv',
145        *,
146        eliciting_virus='SARS-CoV-2',
147        known_to_neutralize="any",
148        weight_by_log_IC50=True,
149        mutation_escape_strength=2,
150    ):
151        """See main class docstring."""
152        # read escape data 
153        self.escape_data = (
154            pd.read_csv(csv_or_url)
155            .assign(
156                eliciting_virus=lambda x: x["eliciting_virus"].str.split(";"),
157                known_to_neutralize=lambda x: x["known_to_neutralize"].str.split(";"),
158                neg_log_IC50=lambda x: x["neg_log_IC50"].map(
159                    lambda s: tuple([pd.NA if si == "NA" else float(si) for si in s.split(";")])
160                )
161            )
162            .explode("eliciting_virus")
163            .explode(["known_to_neutralize", "neg_log_IC50"])
164        )
165        assert all(self.escape_data["neg_log_IC50"] >= 0)
166
167        # make sure escape data has expected columns
168        if not set(self.escape_data.columns).issuperset({'condition',
169                                                         'site',
170                                                         'escape',
171                                                         'eliciting_virus',
172                                                         "known_to_neutralize",
173                                                         "neg_log_IC50",
174                                                         }):
175            raise ValueError(f"{self.escape_data.columns=} lacks expected columns")
176
177        # filter by virus
178        eliciting_viruses = set(self.escape_data["eliciting_virus"])
179        if eliciting_virus not in eliciting_viruses:
180            raise ValueError(f"{eliciting_virus=} not in {eliciting_viruses=}")
181        self.escape_data = self.escape_data.query('eliciting_virus == @eliciting_virus').drop(
182            columns="eliciting_virus"
183        )
184        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
185
186        # filter by known_to_neutralize
187        if known_to_neutralize not in set(self.escape_data['known_to_neutralize']):
188            raise ValueError(f"invalid {known_to_neutralize=}")
189        self.escape_data = (
190            self.escape_data
191            .query("known_to_neutralize == @known_to_neutralize")
192            .drop(columns="known_to_neutralize")
193        )
194        assert len(self.escape_data) == len(self.escape_data.drop_duplicates())
195
196        # get escape scaled by the max escape for that condition
197        self.escape_data = (
198                self.escape_data
199                .assign(max_escape=lambda x: (x.groupby('condition')
200                                              ['escape']
201                                              .transform('max')
202                                              ),
203                        scale_escape=lambda x: x['escape'] / x['max_escape'],
204                        )
205                )
206
207        # get all sites for which we have escape data
208        self.sites = set(self.escape_data['site'])
209
210        # set mutation escape strength
211        self.mutation_escape_strength = mutation_escape_strength
212
213        # do we weight by log IC50?
214        self.weight_by_log_IC50 = weight_by_log_IC50
215
216        assert (
217            self.escape_data["condition"].nunique()
218            == len(self.escape_data[["condition", "neg_log_IC50"]].drop_duplicates())
219        )
220
221        # number of conditions (antibodies), weighting by negative log IC50 if doing that
222        if self.weight_by_log_IC50:
223            self._n_conditions = (
224                self.escape_data
225                [["condition", "neg_log_IC50"]]
226                .drop_duplicates()
227                ["neg_log_IC50"]
228                .sum()
229            )
230        else:
231            self._n_conditions = self.escape_data['condition'].nunique()

See main class docstring.

def escape_per_site(self, mutated_sites) View Source

233    def escape_per_site(self, mutated_sites):
234        """Escape at each site after mutating indicated sites.
235
236        Parameters
237        ----------
238        mutated_sites : array-like of integers
239            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
240
241        Returns
242        -------
243        pandas.DataFrame
244            For each site, gives the original escape and the escape
245            retained after mutations.
246
247        """
248        mutated_sites = set(mutated_sites)
249        if not mutated_sites.issubset(self.sites):
250            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
251        df = (
252            self.escape_data
253            .assign(
254                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
255                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated']
256                )
257            .groupby(['condition', "neg_log_IC50"], as_index=False)
258            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
259            .assign(
260                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
261                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
262            )
263            [["condition", 'cond_bind_retain', "weight"]]
264            .merge(self.escape_data[['condition', 'site', 'escape']])
265            .assign(
266                escape=lambda x: x["escape"] * x["weight"],
267                retained_escape=lambda x: x['cond_bind_retain'] * x['escape'],
268            )
269            .groupby('site')
270            .aggregate(
271                original_escape=pd.NamedAgg('escape', 'sum'),
272                retained_escape=pd.NamedAgg('retained_escape', 'sum'),
273            )
274        ) / self._n_conditions
275        return df.reset_index()

Escape at each site after mutating indicated sites.

Parameters

mutated_sites (array-like of integers): List of mutated sites, must all be in BindingCalculator.sites.

Returns

pandas.DataFrame: For each site, gives the original escape and the escape retained after mutations.

def binding_retained(self, mutated_sites) View Source

277    def binding_retained(self, mutated_sites):
278        """Fraction binding retained after mutating indicated sites.
279
280        Parameters
281        ----------
282        mutated_sites : array-like of integers
283            List of mutated sites, must all be in :attr:`BindingCalculator.sites`.
284
285        Returns
286        -------
287        float
288            The fraction binding retained after these mutations.
289
290        """
291        mutated_sites = set(mutated_sites)
292        if not mutated_sites.issubset(self.sites):
293            raise ValueError(f"invalid sites: {mutated_sites - self.sites}")
294        binding_retained = (
295            self.escape_data
296            .assign(
297                mutated=lambda x: x['site'].isin(mutated_sites).astype(int),
298                site_bind_retain=lambda x: 1 - x['scale_escape'] * x['mutated'],
299            )
300            .groupby(['condition', 'neg_log_IC50'], as_index=False)
301            .aggregate(cond_bind_retain=pd.NamedAgg('site_bind_retain', "prod"))
302            .assign(
303                cond_bind_retain=lambda x: x["cond_bind_retain"].pow(self.mutation_escape_strength),
304                weight=lambda x: x["neg_log_IC50"] if self.weight_by_log_IC50 else 1,
305                weighted_cond_bind_retain=lambda x: x["cond_bind_retain"] * x["weight"],
306            )
307            ['weighted_cond_bind_retain']
308            .sum()
309        ) / self._n_conditions
310        return binding_retained

Fraction binding retained after mutating indicated sites.

Parameters

mutated_sites (array-like of integers): List of mutated sites, must all be in BindingCalculator.sites.

Returns

float: The fraction binding retained after these mutations.