Source code for dms_tools2.neutcurve

"""
=============
neutcurve
=============
Module for fitting neutralization curves.
"""


import math
import io
import collections

import pandas
import scipy
import scipy.optimize

import dms_tools2.plot


[docs]def fit_fourParamLogistics(
        neutdata,
        plotfile,
        fixbottom=False,
        fixtop=1,
        xlabel='antibody concentration',
        ylabel='fraction surviving',
        maxcol=3,
        ic50_in_name=True,
        nfitpoints=100,
        cextend=2):
    """Fits and plots set of neutralization curves.

    Fits a set of 4-parameter logistic neutralization curves
    as defined by :class:`fourParamLogistic`. Then makes a faceted
    plot showing all these curves.

    Args:
        `neutdata` (panda DataFrame)
            Data to fit. There should be a column
            named `concentration` and every other column
            should be named by the sample and give the 
            fraction surviving at each concentration.
        `plotfile` (str)
            Name of created plot file showing the fitted
            curves (e.g., PDF or PNG)
        `fixbottom`
            Same meaning as for :class:`fourParamLogistic`
        `fixtop`
            Same meaning as for :class:`fourParamLogistic`
        `xlabel` (str)
            x-label on plot
        `ylabel` (str)
            y-label on plot
        `maxcol` (int)
            Max number of columns in faceted plot
        `ic50_in_name` (bool)
            Same meaning as for :meth:`fourParamLogistic.plot`
        `nfitpoints` (int)
            Same meaning as for :meth:`fourParamLogistic.plot`
        `cextend` (float)
            Same meaning as for :meth:`fourParamLogistic.plot`

    Returns:
        A dictionary keyed by each sample in `neutdata`,
        and with the values being the :class:`fourParamLogistic`
        fitted object for that sample. You can use these
        objects to get detailed information about the fits.

    Here is an example of plotting three curves in a 
    single row (we keep the default `maxcol=3`):

    >>> neutdata = pandas.read_csv(io.StringIO(
    ... '''concentration K280A-1 K280A-2 wildtype
    ...           0.0002  1.2049  1.2140  1.0549
    ...           0.0005  1.0026  1.0107  0.9337
    ...           0.0011  1.1422  1.1173  0.8025
    ...           0.0026  0.9066  0.8368  0.4267
    ...           0.0061  0.6952  0.8242  0.1769
    ...           0.0142  0.3907  0.4503  0.0234
    ...           0.0331  0.0714  0.0660  0.0001
    ...           0.0771 -0.0119 -0.0157 -0.0146
    ...           0.1800 -0.0311 -0.0333 -0.0377
    ...           0.4199 -0.0344 -0.0320 -0.0367
    ...           0.9797 -0.0365 -0.0382 -0.0346
    ...           2.2861 -0.0291 -0.0345 -0.0362'''),
    ...         delim_whitespace=True, index_col=False)
    >>> plotfile = '_fit_fourParamLogistics_neutplot.png'
    >>> neutcurves = fit_fourParamLogistics(neutdata, plotfile)

    The plot looks like this: 

    .. image:: _static/_fit_fourParamLogistics_neutplot.png
       :width: 6in
       :align: center

    We can access the IC50 values and other fitted parameters via
    the returned :class:`fourParamLogistic` objects. For instance:

    >>> print('IC50 for K280A-1 is {0:.4f}'.format(
    ...         neutcurves['K280A-1'].ic50()))
    IC50 for K280A-1 is 0.0105

    """
    assert len(neutdata.columns) == len(set(neutdata.columns)),\
            "columns in `neutdata` not unique"
    assert 'concentration' in neutdata.columns, \
            "`neutdata` does not have column named `concentration`"
    samples = [col for col in neutdata.columns if col != 'concentration']
    cs = neutdata['concentration']

    neutcurves = {}
    neut_dfs = []
    for s in samples:
        try:
            neutcurves[s] = fourParamLogistic(cs, neutdata[s],
                    fixbottom=fixbottom, fixtop=fixtop)
            neut_dfs.append(neutcurves[s]._neutdata(s,
                    ic50_in_name, nfitpoints, cextend))
        except Exception as e:
            raise RuntimeError("Error fitting curve for "
                    "sample {0}.\n\nError:\n{1}".format(
                    s, str(e)))

    dms_tools2.plot.plotFacetedNeutCurves(
            pandas.concat(neut_dfs),
            plotfile,
            xlabel,
            ylabel,
            maxcol=maxcol)

    return neutcurves



[docs]class fourParamLogistic:
    """A fitted 4-parameter logistic neutralization curve.

    Fits :math:`f(c) = b + \\frac{t - b}{1 + (c/m)^s}`
    where :math:`f(c)` is the fraction surviving at
    concentration :math:`c`, :math:`m` is the midpoint
    of the neutralization curve, :math:`t` is the top
    value (e.g., 1), :math:`b` is the bottom value (e.g., 0),
    and :math:`s` is the slope of the curve.

    This documentation is written for the case when 
    :math:`f(c)` is the fraction **surviving**, meaning that
    :math:`f(c)` gets smaller as :math:`c` gets larger.
    This should lead you to fit :math:`s > 0`.
    If you instead use :math:`f(c)` as the fraction neutralized,
    you will fit :math:`s < 1`. In this case, the interpretation
    of `top` and `bottom` will be reversed. However, this class
    has **not** been extensively tested in that scenario, and
    so may not work completely correctly if you have
    :math:`f(c)` represent the fraction neutralized.

    To initialize and fit an object, provide the following:
        `cs` (array-like)
            Concentrations for which we have measurements
            to fit.
        `fs` (array-like)
            Same length as `cs`, with `fs[i]` giving
            fraction surviving at `cs[i]`.
        `fixbottom` (bool)
            If `False`, we fit the bottom value of the
            curve. If you instead want to fix it,
            provide the number to fix it to (typically 0).
            Fix this number if you think that the neutralization
            goes to completion at high enough antibody.
            concentration. For some viruses (such as HIV),
            the neutralization never goes to completion as 
            there are some resistant viruses (such as due
            to glycan heterogeneity) -- if that is the case,
            you want to fit the bottom rather than fix it to 0.
        `fixtop` (`False` or a float)
            If `False`, we fit the top value of the
            curve. If you instead want to fix it,
            provide the number to fix it to (typically 1).
            Usually you do **not** want to fit this value,
            as the fraction surviving should always be one
            at sufficiently low antibody concentration.

    After initialization, has the following attributes:
        `cs` (numpy array)
            Array of the concentration measurements we have
            fitted, sorted from lowest to highest. 
        `fs` (numpy array)
            Array of the fraction surviving measurements
            corresponding to the entries in `cs`.
        `midpoint` (float)
            Midpoint of curve, :math:`m` in equation above.
            Note that the midpoint may **not** be the same
            as the :meth:`ic50`
        `slope` (float)
            Hill slope of curve, :math:`s` in equation above.
        `bottom` (float)
            Bottom of curve (value as :math:`c` gets large),
            :math:`b` in equation above.
        `top` (float)
            Top of curve (value as :math:`c` get small),
            :math:`t` in equation above.

    Use the :meth:`ic50` function to get the fitted IC50.

    As an example, we first simulate some data with known
    parameter values:

    >>> m = 0.03
    >>> s = 1.9
    >>> b = 0.1
    >>> t = 1.0
    >>> cs = [0.002 * 2**x for x in range(9)]
    >>> fs = [fourParamLogistic.evaluate(c, m, s, b, t) for c in cs]

    Now we fit to these data, and then confirm that the
    fitted values are close to the ones used for the
    simulation:

    >>> neut = fourParamLogistic(cs, fs)
    >>> scipy.allclose(neut.midpoint, m)
    True
    >>> scipy.allclose(neut.slope, s)
    True
    >>> scipy.allclose(neut.top, t)
    True
    >>> scipy.allclose(neut.bottom, b)
    True

    Note how since we fit the curve to simulated data where
    the bottom was 0.1 rather than 0, the midpoint and IC50
    are different. Specifically, the IC50 is larger than the
    midpoint, as you have to go past the midpoint to get
    down to value of 0.5 fraction surviving.

    >>> neut.ic50() > neut.midpoint
    True
    >>> scipy.allclose(neut.ic50(), 0.0337385586)
    True
    >>> scipy.allclose(0.5, neut.fracsurvive(neut.ic50()))
    True
    >>> neut.fracsurvive(neut.midpoint) > 0.5
    True

    Here is a plot of the curve and data points:

    >>> neut.plot('_fourParamLogistic_neutplot.png', 'example')

    .. image:: _static/_fourParamLogistic_neutplot.png
       :width: 2.4in
       :align: center

    Now here is an example where we constrain both the top
    and the bottom (to 1 and 0, respectively) and fit
    the curve. Now the midpoint and IC50 are the same:

    >>> b2 = 0
    >>> t2 = 1
    >>> fs2 = [fourParamLogistic.evaluate(c, m, s, b2, t2) for c in cs]
    >>> neut2 = fourParamLogistic(cs, fs2, fixbottom=b2)
    >>> scipy.allclose(neut2.midpoint, m)
    True
    >>> scipy.allclose(neut2.ic50(), m)
    True

    Now let's fit to concentrations that are all **less**
    than the midpoint, so that we never get anywhere
    close to complete neutralization. In this case,
    the estimated IC50 is unreliable, and so will be
    returned as `None`:
    
    >>> cs3 = [1e-5 * 2**x for x in range(7)]
    >>> (cs3[-1] < m)
    True
    >>> fs3 = [fourParamLogistic.evaluate(c, m, s, b2, t2) for c in cs3]
    >>> neut3 = fourParamLogistic(cs3, fs3, fixbottom=b2)
    >>> neut3.ic50() is None
    True

    However, we can still see a limit on the IC50
    using the :meth:`ic50_str` method. As seen below,
    this limit indicates that the IC50 exceeds the
    largest value in the concentrations used to fit
    the curve:

    >>> cs3[-1]
    0.00064
    >>> neut3.ic50_str()
    '$>0.00064$'

    """

    def __init__(self, cs, fs, fixbottom=False, fixtop=1):
        """See main class docstring."""
        # get data into arrays sorted by concentration
        self.cs = scipy.array(cs)
        self.fs = scipy.array(fs)
        self.fs = self.fs[self.cs.argsort()]
        self.cs = self.cs[self.cs.argsort()]

        # make initial guess for slope to have the right sign
        if self.fs[0] >= self.fs[-1]:
            self.slope = 1.5
        else:
            self.slope = -1.5

        # make initial guess for top and bottom
        if fixtop is False:
            if self.slope > 0:
                self.top = self.fs.max()
            else:
                self.top = self.fs.min()
        else:
            assert isinstance(fixtop, (int, float))
            self.top = fixtop
        if fixbottom is False:
            if self.slope > 0:
                self.bottom = self.fs.min()
            else:
                self.bottom = self.fs.max()
        else:
            assert isinstance(fixbottom, (int, float))
            self.bottom = fixbottom

        # make initial guess for midpoint
        midval = (self.top - self.bottom) / 2.0
        if (self.fs > midval).all():
            if self.slope > 0:
                self.midpoint = self.cs[-1]
            else:
                self.midpoint = self.cs[0]
        elif (self.fs <= midval).all():
            if self.slope > 0:
                self.midpoint = self.cs[0]
            else:
                self.midpoint = self.cs[-1]
        else:
            # get first index where f crosses midpoint
            i = scipy.argmax((self.fs > midval)[:-1] !=
                    (self.fs > midval)[1:])
            assert (self.fs[i] > midval) != (self.fs[i + 1] > midval)
            self.midpoint = (self.cs[i] + self.cs[i + 1]) / 2.0

        # set up function and initial guesses
        if fixtop is False and fixbottom is False:
            initguess = [self.midpoint, self.slope, self.bottom, self.top]
            func = self.evaluate
        elif fixtop is False:
            initguess = [self.midpoint, self.slope, self.top]
            def func(c, m, s, t):
                return self.evaluate(c, m, s, self.bottom, t)
        elif fixbottom is False:
            initguess = [self.midpoint, self.slope, self.bottom]
            def func(c, m, s, b):
                return self.evaluate(c, m, s, b, self.top)
        else:
            initguess = [self.midpoint, self.slope]
            def func(c, m, s):
                return self.evaluate(c, m, s, self.bottom, self.top)

        (popt, pcov) = scipy.optimize.curve_fit(
                func,
                self.cs,
                self.fs,
                initguess
                )

        if fixtop is False and fixbottom is False:
            (self.midpoint, self.slope, self.top, self.bottom) = popt
        elif fixtop is False:
            (self.midpoint, self.slope, self.top) = popt
        elif fixbottom is False:
            (self.midpoint, self.slope, self.bottom) = popt
        else:
            (self.midpoint, self.slope) = popt


[docs]    def ic50(self, extrapolate=False):
        """IC50 value.
        
        Concentration where `fracsurvive` is 0.5. Equals
        `midpoint` if and only if `top = 1` and `bottom = 0`.

        Args:
            `extrapolate` (bool)
                Do we extrapolate IC50 outside of range
                of data we fit? (Generally a bad idea.)

        Returns:
            A number giving the IC50 if the fitted value
            is in the range of concentrations used
            to fit the curve. Otherwise returns `None`
            unless `extrapolate` is `True`. Will still
            return `None` even if `extrapolate` is `True`
            if the fit parameters are such that the
            extrapolated curve never reaches 0.5.

        Calculated from:
        :math:`0.5 = b + \\frac{t - b}{1 + (ic50/m)^s}`,
        which solves to 
        :math:`ic50 = m \\times \left(\\frac{t - 0.5}{0.5 - b}\\right)^{1/s}`
        """
        if ((self.slope == 0) or (self.top == self.bottom) or
                ((self.top > 0.5) == (self.bottom > 0.5))):
            return None # extrapolated curve never hits 0.5
        ic50 = (self.midpoint * ((self.top - 0.5) / 
                (0.5 - self.bottom))**(1.0 / self.slope))
        if (self.cs[0] <= ic50 <= self.cs[-1]) or extrapolate:
            return ic50
        else:
            return None


[docs]    def ic50_str(self):
        """Returns string containing IC50.

        Unlike :meth:`ic50`, it returns a string.
        This is useful because the string will indicate
        if the extrapolated IC50 is outside the 
        range of fitted data, such as by being: `> 0.5`.
        """

        def _sciNot(x):
            """Get number in LaTex scientific notation."""
            return dms_tools2.plot.latexSciNot([x])[0]

        ic50 = self.ic50(extrapolate=True)
        if ic50 is None:
            return 'NA'
        elif ic50 > self.cs[-1]:
            return '$>{0}'.format(_sciNot(self.cs[-1])[1 : ])
        elif ic50 < self.cs[0]:
            return '$<{0}'.format(_sciNot(self.cs[0])[1 : ])
        else:
            return _sciNot(ic50)


[docs]    def fracsurvive(self, c):
        """Fraction surviving at `c` for fitted parameters."""
        return self.evaluate(c, self.midpoint, self.slope,
                self.bottom, self.top)


[docs]    @staticmethod
    def evaluate(c, m, s, b, t):
        """Returns :math:`f(c) = b + \\frac{t - b}{1 + (c/m)^s}`."""
        return b + (t - b) / (1 + (c / m)**s)


    def _neutdata(self, samplename, ic50_in_name,
            nfitpoints, cextend):
        """Gets data for plotting neutralization curve.

        Returns a `pandas.DataFrame` appropriate for passing
        to :func:`dms_tools2.plot.plotFacetedNeutCurves`. The
        arguments have the meanings explained in :meth:`plot`.
        """
        concentrations = scipy.concatenate(
                [self.cs,
                 scipy.logspace(math.log10(self.cs.min() / cextend),
                                math.log10(self.cs.max() * cextend),
                                num=nfitpoints)
                ])
        n = len(concentrations)

        points = scipy.concatenate(
                [self.fs,
                 scipy.full(n - len(self.fs), scipy.nan)])

        fit = scipy.array([self.fracsurvive(c) for c in concentrations])

        if ic50_in_name:
            samplename += ' (IC50 = {0})'.format(self.ic50_str())

        return pandas.DataFrame.from_dict(collections.OrderedDict([
                ('concentration', concentrations),
                ('sample', [samplename] * n),
                ('points', points),
                ('fit', fit),
                ]))


[docs]    def plot(self, plotfile, samplename,
            ic50_in_name=True,
            xlabel='antibody concentration',
            ylabel='fraction surviving',
            nfitpoints=100, cextend=2):
        """Plots neutralization points and fitted curve.

        Args:
            `plotfile` (str)
                Name of plot to create (e.g., PDF or PNG).
            `samplename` (str)
                Sample name (put at top of plot)
            `ic50_in_name` (bool)
                Include IC50 at top of plot with sample name?
            `xlabel` (str)
                x-axis label
            `ylabel` (str)
                y-axis label
            `nfitpoints` (int)
                Number of points used to draw fit line.
            `cextend` (float)
                Concentrations extend this much fold
                below / above the min / max values used
                for fitting.
        """
        dms_tools2.plot.plotFacetedNeutCurves(
                self._neutdata(
                        samplename,
                        ic50_in_name,
                        nfitpoints,
                        cextend),
                plotfile,
                xlabel,
                ylabel)



if __name__ == '__main__':
    import doctest
    doctest.testmod()
Source code for dms_tools2.neutcurve

dms_tools2

Navigation

Related Topics