Source code for dms_tools2.compareprefs

"""
========================
compareprefs
========================

Compare site-specific amino-acid preferences among homologs.
"""

import math
import tempfile
import io
import functools
import itertools

import natsort
import numpy
import pandas

import dms_tools2


[docs]def divJensenShannon(p1, p2):
    """Jensen-Shannon divergence between two distributions.

    The logarithms are taken to base 2, so the result will be
    between 0 and 1.

    Args:
        `p1` and `p2` (array-like)
            The two distributions for which we compute divergence.

    Returns:
        The Jensen-Shannon divergence as a float.

    >>> p1 = [0.5, 0.2, 0.2, 0.1]
    >>> p2 = [0.4, 0.1, 0.3, 0.2]
    >>> p3 = [0.0, 0.2, 0.2, 0.6]
    >>> numpy.allclose(divJensenShannon(p1, p1), 0, atol=1e-5)
    True
    >>> numpy.allclose(divJensenShannon(p1, p2), 0.035789, atol=1e-5)
    True
    >>> numpy.allclose(divJensenShannon(p1, p3), 0.392914, atol=1e-5)
    True
    """
    p1 = numpy.array(p1)
    p2 = numpy.array(p2)

    def _kldiv(a, b):
        with numpy.errstate(all='ignore'):
            kl = numpy.sum([v for v in a * numpy.log2(a / b)
                    if not numpy.isnan(v)])
        return kl

    m = 0.5 * (p1 + p2)

    return 0.5 * (_kldiv(p1, m) +_kldiv(p2, m))


[docs]def computeRMS(v):
    """The root mean square (RMS) of a list of values
    
    Args:
        `v` (array-like)
            Values for which we compute the RMS.
            
    Returns:
        The RMS of the values.

    >>> v = [1.2, 3.5, 6.8, 1.1]
    >>> numpy.allclose(computeRMS(v), 3.9096, atol=1e-4)
    True
    """
    v = numpy.array(v)
    assert len(v) > 0
    return math.sqrt((v**2).sum() / len(v))


[docs]def prefDistance(pi1, pi2, distmetric):
    """Computes the distance between two arrays of preferences.
    
    Args:
        `pi1` and `pi2` (array-like)
            Two arrays of preferences.
        `distmetric` (string)
            Distance metric to use. Can be:
                - `half_sum_abs_diff`: half sum absolute value of difference
                - `JensenShannon`: square root of Jensen-Shannon divergence
            
    Returns:
        The distance between `pi1` and `pi2`.
        
    >>> pi1 = [0.5, 0.2, 0.3]
    >>> pi2 = [0.2, 0.4, 0.4]
    >>> numpy.allclose(prefDistance(pi1, pi1, 'half_sum_abs_diff'), 0)
    True
    >>> numpy.allclose(prefDistance(pi1, pi1, 'JensenShannon'), 0)
    True
    >>> numpy.allclose(prefDistance(pi1, pi2, 'half_sum_abs_diff'), 0.3)
    True
    >>> numpy.allclose(prefDistance(pi1, pi2, 'JensenShannon'), 0.2785483)
    True
    """
    pi1 = numpy.array(pi1)
    pi2 = numpy.array(pi2)
    assert len(pi1) == len(pi2)
    assert numpy.allclose(pi1.sum(), 1, atol=0.005)
    assert numpy.allclose(pi2.sum(), 1, atol=0.005)
    assert numpy.all(pi1 >= 0)
    assert numpy.all(pi2 >= 0)

    if distmetric == 'half_sum_abs_diff':
        dist = (numpy.fabs(pi1 - pi2)).sum() / 2.0

    elif distmetric == 'JensenShannon':
        dist = math.sqrt(divJensenShannon(pi1, pi2))

    else:
        raise ValueError('Invalid `distmetric` {0}'.format(distmetric))

    return dist


[docs]def comparePrefs(prefs1, prefs2, sites=None, distmetric='half_sum_abs_diff',
        chars=dms_tools2.AAS):
    """Compute error-corrected distance between two sets of preferences.
  
    Designed for the situation in which you have made replicate
    measurements of the amino-acid preferences for two protein
    homologs, and want to estimate the difference in preferences
    at each site while correcting for experimental error as
    quantified by the replicate measurements.

    The *distance* between each pair of replicates at each
    site is computed using `prefDistance` with `distmetric`.
    We then compute the RMS distance between all pairs
    for the same homolog to get `RMSDwithin`, and all pairs
    of different homologs to get `RMSDbetween`. We calculate
    `RMSDcorrected` as `RMSDbetween - RMSDwithin`.

    We also compute the mean (across replicates) preference
    for homolog 1 minus the mean for homolog 2, scaled so
    that the total height in each direction equals `RMSDcorrected`.
    These values are an error-corrected estimate of the difference
    in preference for each amino acid between homologs.

    Args:
        `prefs1` (list)
            Files giving replicate measurements of preferences for
            homolog 1 in the CSV format returned by ``dms2_prefs``.
        `prefs2` (list)
            Files giving measurements for homolog 2.
        `sites` (list or `None`)
            If `None`, compare all sites shared between the two
            homolog preference sites. Otherwise should be a list
            of the sites to compare.
        `distmetric` (string)
            Distance metric to use. Can be any valid option for
            the argument of the same name to `prefDistance`.
        `chars` (list)
            List of characters for which we analyze the preferences.
            For instance, all 20 amino acids.
    
    Returns:
        A `pandas.DataFrame` giving the distances at each site,
        as well as the replicate mean difference between 
        preferences for homolog 1 minus homolog 2 for each amino
        acid at each site scaled to height of `RMSDcorrected`
        in each direction.

    Example calculation for two character sequences and two
    replicates for each homolog:

    >>> TF = functools.partial(tempfile.NamedTemporaryFile, mode='w')
    >>> with TF() as p1_1, TF() as p1_2, TF() as p2_1, TF() as p2_2:
    ...     n = p1_1.write('''site,    A,    C
    ...                          1,  0.8,  0.2
    ...                          2,  0.3,  0.7'''.replace(' ', ''))
    ...     p1_1.flush()
    ...     n = p1_2.write('''site,    A,    C
    ...                          1,  0.8,  0.2
    ...                          2,  0.4,  0.6'''.replace(' ', ''))
    ...     p1_2.flush()
    ...     n = p2_1.write('''site,    A,    C
    ...                          2,  0.4,  0.6
    ...                          1,  0.6,  0.4'''.replace(' ', ''))
    ...     p2_1.flush()
    ...     n = p2_2.write('''site,    A,    C
    ...                          1,  0.6,  0.4
    ...                         1a,  0.4,  0.6
    ...                          2,  0.5,  0.5'''.replace(' ', ''))
    ...     p2_2.flush()
    ...     diffs = comparePrefs([p1_1.name, p1_2.name],
    ...                          [p2_1.name, p2_2.name],
    ...                          chars=['A', 'C'])
    >>> print(diffs.to_string(float_format=lambda x: '{0:.2f}'.format(x)))
      site  RMSDcorrected  RMSDbetween  RMSDwithin     A     C
    0    1           0.20         0.20        0.00  0.20 -0.20
    1    2           0.02         0.12        0.10 -0.02  0.02
    """

    assert len(prefs1) > 1, "provide prefs for multiple replicates"
    assert len(prefs2) > 1, "provide prefs for multiple replicates"
  
    # read in all preferences
    prefs = []
    expectcols = ['site'] + chars
    for (homolog, homologprefs) in enumerate([prefs1, prefs2], 1):
        for (rep, repprefs) in enumerate(homologprefs, 1):
            iprefs = pandas.read_csv(repprefs)
            iprefs['site'] = iprefs['site'].astype('str')
            assert set(iprefs.columns) <= set(expectcols), \
                    "{0} missing expected columns".format(repprefs)
            prefs.append(iprefs[expectcols]
                               .assign(homolog=homolog, replicate=rep)
                               )

    # get only desired sites
    if sites is None:
        # use sites shared among all preference sets
        sites = list(set.intersection(*[set(p['site']) for p in prefs]))
    assert isinstance(sites, list) and len(sites), "no `sites` to analyze"
    sites = natsort.realsorted(list(map(str, sites)))

    # merge preferences for desired sites
    assert all([set(p['site']) >= set(sites) for p in prefs]),\
            "not all prefs have all sites"
    prefs = [p[p['site'].isin(sites)] for p in prefs]
    prefs = pandas.concat(prefs)
    prefs['site'] = pandas.Categorical(prefs['site'], sites)
    prefs = prefs.sort_values('site').set_index('site')

    # compute RMSDs
    dists = {'within':[], 'between':[]}
    for ((hi, repi), (hj, repj)) in itertools.combinations(
            [(h, rep) for h in [1, 2] for rep in
            prefs.query('homolog == @h')['replicate'].unique()], 2):
        prefsi = (prefs.query('homolog == @hi and replicate == @repi')
                  [chars])
        prefsj = (prefs.query('homolog == @hj and replicate == @repj')
                  [chars])
        assert prefsi.index.equals(prefsj.index)
        disttype = {True:'within', False:'between'}[hi == hj]
        dists[disttype].append(
                [prefDistance(prefsi.loc[r], prefsj.loc[r], distmetric)
                for r in sites]
                )
    for (disttype, dist) in dists.items():
        distseries = (pandas.DataFrame(dist, columns=sites)
                      .transpose()
                      .apply(computeRMS, axis=1)
                      )
        prefs['RMSD' + disttype] = distseries
    prefs['RMSDcorrected'] = prefs['RMSDbetween'] - prefs['RMSDwithin']
    rmsds = ['RMSDcorrected', 'RMSDbetween', 'RMSDwithin']

    # compute RMSDcorrected-scaled diff between homologs for each pref
    prefmeans = {}
    for homolog in [1, 2]:
        prefmeans[homolog] = (prefs.reset_index()
                              .query('homolog == @homolog')
                              .groupby('site')
                              [chars]
                              .mean()
                              )
    prefs = prefs[~prefs.index.duplicated(keep='first')][rmsds]
    dprefs = prefmeans[1] - prefmeans[2]
    # normalize so sums to one in each direction
    dprefs = dprefs.div(dprefs.abs().sum(axis=1), axis=0).mul(2).fillna(0)
    dprefs = dprefs.mul(prefs['RMSDcorrected'], axis=0)
    prefs = prefs.join(dprefs)

    return prefs[rmsds + chars].reset_index()


if __name__ == '__main__':
    import doctest
    doctest.testmod()
Source code for dms_tools2.compareprefs

dms_tools2

Navigation

Related Topics