
"""
========================
prefs
========================

Performs operations related to estimating site-specific amino-acid
preferences.

Uses `pystan <https://pystan.readthedocs.io/en/latest>`_
to perform MCMC for Bayesian inferences.
"""


import time
import math
import tempfile
import pickle
import random
import collections

import natsort
import numpy
import numpy.random
import pandas
import Bio.SeqIO
import pystan

import dms_tools2

#: minimum value for Dirichlet prior elements
PRIOR_MIN_VALUE = 1.0e-7 


class StanModelNoneErr(object):
    """``pystan`` model when `error_model` is `none`.

    For use by `inferSitePrefs`."""

    def __init__(self, verbose=False):
        """Compile ``pystan`` model.

        Args:
            `verbose` (bool)
                Set to `True` if you want verbose compilation.
        """
        self.pystancode =\
"""
data {{
    int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
    int<lower=0> nrpre[Nchar]; // counts pre-selection
    int<lower=0> nrpost[Nchar]; // counts post-selection
    vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
}}
parameters {{
    simplex[Nchar] pir;
    simplex[Nchar] mur;
}}
transformed parameters {{
    simplex[Nchar] fr;
    fr = pir .* mur / dot_product(pir, mur);
}}
model {{
    pir ~ dirichlet(pir_prior_params);
    mur ~ dirichlet(mur_prior_params);
    nrpre ~ multinomial(mur);
    nrpost ~ multinomial(fr);
}}
""".format(PRIOR_MIN_VALUE)
        self.model = pystan.StanModel(model_code=self.pystancode,
                verbose=verbose)

class StanModelSameErr:
    """``pystan`` model when `error_model` is `same`.

    For use by `inferSitePrefs`."""

    def __init__(self, verbose=False):
        """Compile ``pystan`` model.

        Args:
            `verbose` (bool)
                Set to `True` if you want verbose compilation.
        """
        self.pystancode =\
"""
data {{
    int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
    int<lower=1, upper=Nchar> iwtchar; // index of wildtype character in 1, ... numbering
    int<lower=0> nrpre[Nchar]; // counts pre-selection
    int<lower=0> nrpost[Nchar]; // counts post-selection
    int<lower=0> nrerr[Nchar]; // counts in error control
    vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] epsilonr_prior_params; // Dirichlet prior params
}}
transformed data {{
    simplex[Nchar] deltar;
    for (ichar in 1:Nchar) {{
        deltar[ichar] = 0.0;
    }}
    deltar[iwtchar] = 1.0;
}}
parameters {{
    simplex[Nchar] pir;
    simplex[Nchar] mur;
    simplex[Nchar] epsilonr;
}}
transformed parameters {{
    simplex[Nchar] fr_plus_err;
    simplex[Nchar] mur_plus_err;
    fr_plus_err = pir .* mur / dot_product(pir, mur) + epsilonr - deltar;
    mur_plus_err = mur + epsilonr - deltar;
}}
model {{
    pir ~ dirichlet(pir_prior_params);
    mur ~ dirichlet(mur_prior_params);
    epsilonr ~ dirichlet(epsilonr_prior_params);
    nrerr ~ multinomial(epsilonr);
    nrpre ~ multinomial(mur_plus_err);
    nrpost ~ multinomial(fr_plus_err);
}}
""".format(PRIOR_MIN_VALUE)
        self.model = pystan.StanModel(model_code=self.pystancode,
                verbose=verbose)

class StanModelDifferentErr:
    """``pystan`` model when `error_model` is `different`.

    For use by `inferSitePrefs`."""

    def __init__(self, verbose=False):
        """Compile ``pystan`` model.

        Args:
            `verbose` (bool)
                Set to `True` if you want verbose compilation.
        """
        self.pystancode =\
"""
data {{
    int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
    int<lower=1, upper=Nchar> iwtchar; // index of wildtype character in 1, ... numbering
    int<lower=0> nrpre[Nchar]; // counts pre-selection
    int<lower=0> nrpost[Nchar]; // counts post-selection
    int<lower=0> nrerrpre[Nchar]; // counts in pre-selection error control
    int<lower=0> nrerrpost[Nchar]; // counts in post-selection error control
    vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] epsilonr_prior_params; // Dirichlet prior params
    vector<lower={0:g}>[Nchar] rhor_prior_params; // Dirichlet prior params
}}
transformed data {{
    simplex[Nchar] deltar;
    for (ichar in 1:Nchar) {{
        deltar[ichar] = 0.0;
    }}
    deltar[iwtchar] = 1.0;
}}
parameters {{
    simplex[Nchar] pir;
    simplex[Nchar] mur;
    simplex[Nchar] epsilonr;
    simplex[Nchar] rhor;
}}
transformed parameters {{
    simplex[Nchar] fr_plus_err;
    simplex[Nchar] mur_plus_err;
    fr_plus_err = pir .* mur / dot_product(pir, mur) + rhor - deltar;
    mur_plus_err = mur + epsilonr - deltar;
}}
model {{
    pir ~ dirichlet(pir_prior_params);
    mur ~ dirichlet(mur_prior_params);
    epsilonr ~ dirichlet(epsilonr_prior_params);
    rhor ~ dirichlet(rhor_prior_params);
    nrerrpre ~ multinomial(epsilonr);
    nrerrpost ~ multinomial(rhor);
    nrpre ~ multinomial(mur_plus_err);
    nrpost ~ multinomial(fr_plus_err);
}}
""".format(PRIOR_MIN_VALUE)
        self.model = pystan.StanModel(model_code=self.pystancode,
                verbose=verbose)

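# Example (illustrative sketch, not executed as part of the module): when
# inferring preferences for many sites, compile a Stan model once and pass
# the compiled object as `error_model` to `inferSitePrefs` so the model is
# not recompiled for every site. The `sites_data` iterable and `priors`
# dict here are hypothetical placeholders for caller-prepared data.
#
#   model = StanModelNoneErr()  # compile once; compilation is slow
#   for (wtchar, site_counts) in sites_data:
#       (converged, pi_means, pi_95credint, log) = inferSitePrefs(
#               dms_tools2.AAS, wtchar, model, site_counts, priors)
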
def _initialValuePrefs(error_model, nchains, iwtchar, nchars):
    """Gets valid initial values for ``pystan`` preference inference.

    Initial values chosen by ``pystan`` are frequently invalid.
    This function will generate random values that are valid
    for initialization and return a list that can be passed to the
    `StanModel` as the `init` argument.
    """
    initattempts = 10 # might fail for pathological random values
    nrescales = 10 # rescale non-wildtype values down this many times
    rescalefactor = 5.0 # rescale non-wildtype values down by this much
    deltar = numpy.zeros(nchars)
    deltar[iwtchar] = 1.0
    init = []
    concparams = [1.0 for ichar in range(nchars)]
    concparams[iwtchar] = 10.0 # make this element bigger
    if error_model == 'none':
        for chain in range(nchains):
            while True:
                chain_init = {
                        'pir':numpy.random.dirichlet(concparams),
                        'mur':numpy.random.dirichlet(concparams),
                        }
                if (all(chain_init['pir'] > PRIOR_MIN_VALUE) and
                        all(chain_init['mur'] > PRIOR_MIN_VALUE)):
                    init.append(chain_init)
                    break
        return init
    elif error_model == 'same':
        posterrname = 'epsilonr'
    elif error_model == 'different':
        posterrname = 'rhor'
    else:
        raise ValueError("Invalid error_model {0}".format(error_model))
    for chain in range(nchains):
        for iattempt in range(initattempts):
            chain_init = {
                    'pir':numpy.random.dirichlet(concparams),
                    'mur':numpy.random.dirichlet(concparams),
                    'epsilonr':numpy.random.dirichlet(concparams),
                    }
            if error_model == 'different':
                chain_init['rhor'] = numpy.random.dirichlet(concparams)
            irescale = 0
            while irescale < nrescales and (any(
                    chain_init['pir'] * chain_init['mur'] /
                    numpy.dot(chain_init['pir'], chain_init['mur']) +
                    chain_init[posterrname] - deltar < PRIOR_MIN_VALUE)
                    or any(chain_init['mur'] + chain_init['epsilonr'] -
                    deltar < PRIOR_MIN_VALUE)):
                irescale += 1
                chain_init['epsilonr'] /= rescalefactor
                chain_init['epsilonr'][iwtchar] = 1.0 - sum(
                        [chain_init['epsilonr'][ichar] for ichar
                        in range(nchars) if ichar != iwtchar])
                if error_model == 'different':
                    chain_init['rhor'] /= rescalefactor
                    chain_init['rhor'][iwtchar] = 1.0 - sum([
                            chain_init['rhor'][ichar] for ichar
                            in range(nchars) if ichar != iwtchar])
            if (all(chain_init['pir'] * chain_init['mur'] /
                    numpy.dot(chain_init['pir'], chain_init['mur']) +
                    chain_init[posterrname] - deltar > PRIOR_MIN_VALUE)
                    and all(chain_init['mur'] + chain_init['epsilonr'] -
                    deltar > PRIOR_MIN_VALUE)):
                break
        else:
            raise ValueError("Failed to initialize")
        init.append(chain_init)
    return init

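# Example (illustrative sketch): `_initialValuePrefs` returns one dict of
# simplex vectors per chain, suitable for the `init` argument of
# `StanModel.sampling`. For instance, for 4 chains of an amino-acid model
# with the wildtype character at index 0:
#
#   init = _initialValuePrefs('same', nchains=4, iwtchar=0, nchars=20)
#   assert len(init) == 4
#   assert all(numpy.allclose(chain['pir'].sum(), 1) for chain in init)
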
def inferPrefsByRatio(charlist, sites, wts, pre, post,
        errpre, errpost, pseudocount):
    r"""Site-specific preferences from normalized enrichment ratios.

    Calculates site-specific preference :math:`\pi_{r,a}` of each
    site :math:`r` for each character :math:`a` using re-normalized
    enrichment ratios. This function accomplishes the same goal as
    `inferSitePrefs`, but is much simpler and faster, although less
    statistically rigorous in principle.

    Specifically, this function calculates the preferences as

    .. math::

        \pi_{r,a} = \frac{\phi_{r,a}}{\sum_{a'} \phi_{r,a'}}

    where :math:`\phi_{r,a}` is the enrichment ratio of character
    :math:`a` relative to wildtype after selection. The actual
    definition of these enrichment ratios is somewhat complex due
    to the need to correct for errors and add a pseudocount. We have
    4 sets of counts:

        - `pre`: pre-selection counts

        - `post`: post-selection counts

        - `errpre`: error-control for pre-selection counts

        - `errpost`: error-control for post-selection counts

    For each set of counts :math:`s`, let :math:`N_r^s` (e.g.,
    :math:`N_r^{pre}`) denote the total counts for this sample at
    site :math:`r`, and let :math:`n_{r,a}^{s}` (e.g.,
    :math:`n_{r,a}^{pre}`) denote the counts at that site for
    character :math:`a`.

    Because some of the counts are low, we add a pseudocount
    :math:`P` to each observation. Importantly, we need to scale
    this pseudocount by the total depth for that sample at that site
    in order to avoid systematically estimating different frequencies
    purely as a function of sequencing depth, which will bias the
    preference estimates. Therefore, given the pseudocount value
    :math:`P` defined via the ``--pseudocount`` argument to
    :ref:`dms2_prefs`, we calculate the scaled pseudocount for
    sample :math:`s` and site :math:`r` as

    .. math::

        P_r^s = P \times \frac{N_r^s}{\min\limits_{s'} N_r^{s'}}.

    This equation sets the pseudocount equal to :math:`P` for the
    lowest-depth sample, and scales up the pseudocounts for all
    other samples proportional to their depth.

    With this pseudocount, the estimated frequency of character
    :math:`a` at site :math:`r` in sample :math:`s` is

    .. math::

        f_{r,a}^s = \frac{n_{r,a}^s + P_r^s}{N_{r}^s + A \times P_r^s}

    where :math:`A` is the number of characters in the alphabet
    (e.g., 20 for amino acids without stop codons).

    The **error-corrected** estimates of the frequency of character
    :math:`a` before and after selection are then

    .. math::

        f_{r,a}^{before} &= \max\left(
            \frac{P_r^{pre}}{N_{r}^{pre} + A \times P_r^{pre}},
            \; f_{r,a}^{pre} + \delta_{a,\rm{wt}\left(r\right)} -
            f_{r,a}^{errpre}\right) \\
        f_{r,a}^{after} &= \max\left(
            \frac{P_r^{post}}{N_{r}^{post} + A \times P_r^{post}},
            \; f_{r,a}^{post} + \delta_{a,\rm{wt}\left(r\right)} -
            f_{r,a}^{errpost}\right)

    where :math:`\delta_{a,\rm{wt}\left(r\right)}` is the
    Kronecker-delta, equal to 1 if :math:`a` is the same as the
    wildtype character :math:`\rm{wt}\left(r\right)` at site
    :math:`r`, and 0 otherwise. We use the :math:`\max` operator to
    ensure that even if the error-control frequency exceeds the
    actual estimate, our estimated frequency never drops below the
    pseudocount over the depth of the uncorrected sample.

    Given these definitions, we then simply calculate the enrichment
    ratios as

    .. math::

        \phi_{r,a} = \frac{\left(f_{r,a}^{after}\right) /
            \left(f_{r,\rm{wt}\left(r\right)}^{after}\right)}
            {\left(f_{r,a}^{before}\right) /
            \left(f_{r,\rm{wt}\left(r\right)}^{before}\right)}

    In the case where we are **not** using any error-controls, then
    we simply set :math:`f_{r,a}^{errpre} = f_{r,a}^{errpost} =
    \delta_{a,\rm{wt}\left(r\right)}`.

    Args:
        `charlist` (list)
            Valid characters (e.g., codons, amino acids, nts).
        `sites` (list)
            Sites to analyze.
        `wts` (list)
            `wts[r]` is the wildtype character at site `sites[r]`.
        `pre` (pandas.DataFrame)
            Gives pre-selection counts. Should have columns with
            names of 'site' and all characters in `charlist`. The
            rows give the counts of each character at that site.
        `post` (pandas.DataFrame)
            Like `pre` but for post-selection counts.
        `errpre` (`None` or pandas.DataFrame)
            Like `pre` but for pre-selection error-control counts,
            or `None` if there is no such control.
        `errpost` (`None` or pandas.DataFrame)
            Like `pre` but for post-selection error-control counts,
            or `None` if there is no such control.
        `pseudocount` (float or int)
            The pseudocount to add to each observation.

    Returns:
        A pandas.DataFrame holding the preferences. The columns of
        this dataframe are 'site' and all characters in `charlist`.
        For each site in `sites`, the rows give the preference for
        that character.
    """
    assert len(wts) == len(sites) > 0
    assert all([wt in charlist for wt in wts]), "invalid char in wts"
    assert pseudocount > 0, "pseudocount must be greater than zero"

    dfs = {'pre':pre.copy(),
           'post':post.copy()
          }
    if errpre is not None:
        dfs['errpre'] = errpre.copy()
    if errpost is not None:
        dfs['errpost'] = errpost.copy()

    # compute total depth for each sample and sort by sites
    depths = []
    for stype in dfs.keys():
        df = dfs[stype]
        assert set(list(charlist) + ['site']) <= set(df.columns)
        assert set(sites) <= set(df['site'])
        df = df.query('site in @sites')
        assert len(df.index) == len(wts) == len(sites)
        df['site'] = pandas.Categorical(df['site'], sites)
        df = df.sort_values('site').reset_index(drop=True)
        df['Nr'] = df[charlist].sum(axis=1).astype('float')
        depths.append(df['Nr'])
        dfs[stype] = df
    mindepth = pandas.concat(depths, axis=1).min(axis=1)

    # calculate scaled pseudocounts
    pseudocounts = dict([(stype, pseudocount *
            (dfs[stype]['Nr'] / mindepth).fillna(1.0))
            for stype in dfs.keys()])

    # calculate frequencies
    for stype in dfs.keys():
        df = dfs[stype]
        for c in charlist:
            df['f{0}'.format(c)] = (df[c] + pseudocounts[stype]) / (
                    df['Nr'] + len(charlist) * pseudocounts[stype])
            df['delta{0}'.format(c)] = [int(wt == c) for wt in wts]
        dfs[stype] = df

    # fill values for errpre / errpost if they were None
    for stype in ['pre', 'post']:
        err = 'err{0}'.format(stype)
        if err not in dfs:
            dfs[err] = pandas.DataFrame(dict([('site', sites)] +
                    [('f{0}'.format(c), dfs[stype]['delta{0}'.format(c)])
                    for c in charlist]))

    # compute phi values
    fr = {}
    for (key, f, ferr, p) in [
            ('before', dfs['pre'], dfs['errpre'], pseudocounts['pre']),
            ('after', dfs['post'], dfs['errpost'], pseudocounts['post']),
            ]:
        fr[key] = {'wt':numpy.zeros(len(sites))}
        for c in charlist:
            fr[key][c] = numpy.maximum(
                    p / (f['Nr'].values + len(charlist) * p),
                    f['f{0}'.format(c)].values +
                    f['delta{0}'.format(c)].values -
                    ferr['f{0}'.format(c)].values
                    )
            fr[key]['wt'] += fr[key][c] * f['delta{0}'.format(c)].values
    phi = pandas.DataFrame({'site':sites})
    for c in charlist:
        phi[c] = ((fr['after'][c] / fr['after']['wt']) /
                  (fr['before'][c] / fr['before']['wt']))
    phi['denom'] = phi[charlist].sum(axis=1)

    # normalize phi values to get pi values
    prefs = phi[charlist].div(phi['denom'], axis=0)
    prefs.insert(0, 'site', sites)

    return prefs

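# Example (illustrative sketch with made-up toy counts): a minimal call to
# `inferPrefsByRatio` for two nucleotide sites without error controls. The
# returned preferences sum to one at each site.
#
#   charlist = ['A', 'C', 'G', 'T']
#   sites = [1, 2]
#   wts = ['A', 'C']
#   pre = pandas.DataFrame({'site':sites, 'A':[310, 10], 'C':[10, 300],
#                           'G':[20, 10], 'T':[10, 20]})
#   post = pandas.DataFrame({'site':sites, 'A':[300, 10], 'C':[10, 320],
#                            'G':[30, 10], 'T':[10, 10]})
#   prefs = inferPrefsByRatio(charlist, sites, wts, pre, post,
#                             errpre=None, errpost=None, pseudocount=1)
#   assert numpy.allclose(prefs[charlist].sum(axis=1), 1)
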
def inferSitePrefs(charlist, wtchar, error_model, counts, priors,
        seed=1, niter=10000, increasetries=5, n_jobs=1,
        r_max=1.1, neff_min=100, nchains=4, increasefac=2):
    r"""Infers site-specific preferences by MCMC for a specific site.

    Infer the site-specific preferences :math:`\pi_{r,a}` for some
    site :math:`r` for each character :math:`a` by integrating over
    the posterior defined by the product of the following priors and
    likelihoods, where :math:`\boldsymbol{\mathbf{\pi_r}}` indicates
    the vector of :math:`\pi_{r,a}` values for all characters:

    .. math::

        \Pr\left(\boldsymbol{\mathbf{\pi_r}}\right) &=
            \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\pi,r}}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{\mu_r}}\right) &=
            \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\mu,r}}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{\epsilon_r}}\right) &=
            \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\epsilon,r}}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{\rho_r}}\right) &=
            \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\rho,r}}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{n_r^{\rm{pre}}}} \mid
            \boldsymbol{\mathbf{\mu_r}},
            \boldsymbol{\mathbf{\epsilon_r}}\right) &=
            \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{pre}}}};
            \boldsymbol{\mathbf{\mu_r}} + \boldsymbol{\mathbf{\epsilon_r}}
            - \boldsymbol{\mathbf{\delta_r}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{n_r^{\rm{post}}}} \mid
            \boldsymbol{\mathbf{\mu_r}},
            \boldsymbol{\mathbf{\epsilon_r}}\right) &=
            \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{post}}}};
            \frac{\boldsymbol{\mathbf{\mu_r}} \circ
            \boldsymbol{\mathbf{\pi_r}}}{\boldsymbol{\mathbf{\mu_r}}
            \cdot \boldsymbol{\mathbf{\pi_r}}} +
            \boldsymbol{\mathbf{\rho_r}} -
            \boldsymbol{\mathbf{\delta_r}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}} \mid
            \boldsymbol{\mathbf{\epsilon_r}}\right) &=
            \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}};
            \boldsymbol{\mathbf{\epsilon_r}}\right) \\
        \Pr\left(\boldsymbol{\mathbf{n_r^{\rm{err,post}}}} \mid
            \boldsymbol{\mathbf{\rho_r}}\right) &=
            \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{err,post}}}};
            \boldsymbol{\mathbf{\rho_r}}\right)

    where :math:`\boldsymbol{\mathbf{\delta_r}}` is a vector that is
    zero except for a one at the element corresponding to the
    wildtype.

    The MCMC tries to guarantee convergence via the parameters
    specified by `r_max`, `neff_min`, `nchains`, `niter`,
    `increasefac`, and `increasetries`.

    Args:
        `charlist` (list)
            List of valid characters (e.g., codons, amino acids, nts).
        `wtchar` (str)
            Wildtype character at the site.
        `error_model` (str or object)
            Specifies how errors are estimated. Passing error model
            objects is faster if calling this function repeatedly,
            as they will not need to be compiled. Can be:

              - The str `none` or instance of `StanModelNoneErr`:
                no errors
                (:math:`\boldsymbol{\mathbf{\epsilon_r}} =
                \boldsymbol{\mathbf{\rho_r}} = \mathbf{0}`)

              - The str `same` or instance of `StanModelSameErr`:
                same error rates pre- and post-selection
                (:math:`\boldsymbol{\mathbf{\epsilon_r}} =
                \boldsymbol{\mathbf{\rho_r}}`).

              - The str `different` or instance of
                `StanModelDifferentErr`: different error rates pre-
                and post-selection
                (:math:`\boldsymbol{\mathbf{\epsilon_r}} \ne
                \boldsymbol{\mathbf{\rho_r}}`).

        `counts` (dict)
            Deep sequencing counts. Each string key should specify
            a dict keyed by all characters in `charlist` with the
            values giving integer counts for that character. The
            keys:

              - `pre`: :math:`\boldsymbol{\mathbf{n_r^{\rm{pre}}}}`

              - `post`: :math:`\boldsymbol{\mathbf{n_r^{\rm{post}}}}`

              - `err`: required if `error_model` is `same`, specifies
                :math:`\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}} =
                \boldsymbol{\mathbf{n_r^{\rm{err,post}}}}`

              - `errpre` and `errpost`: required if `error_model` is
                `different`, specify
                :math:`\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}}` and
                :math:`\boldsymbol{\mathbf{n_r^{\rm{err,post}}}}`.

        `priors` (dict)
            Specifies parameter vectors for Dirichlet priors. Each
            string key should specify a dict keyed by all characters
            and values giving the prior for that character. Values
            less than `PRIOR_MIN_VALUE` are set to `PRIOR_MIN_VALUE`.
            Keys are:

              - `pir_prior_params`:
                :math:`\boldsymbol{\mathbf{a_{\pi,r}}}`

              - `mur_prior_params`:
                :math:`\boldsymbol{\mathbf{a_{\mu,r}}}`

              - `epsilonr_prior_params`: only required if
                `error_model` is `same` or `different`, specifies
                :math:`\boldsymbol{\mathbf{a_{\epsilon,r}}}`

              - `rhor_prior_params`: only required if `error_model`
                is `different`, specifies
                :math:`\boldsymbol{\mathbf{a_{\rho,r}}}`

        `seed` (int)
            Random number seed for MCMC.
        `n_jobs` (int)
            Number of CPUs to use, -1 means all available.
        `niter`, `increasetries`, `r_max`, `neff_min`, `nchains`, `increasefac`
            Specify MCMC convergence. They all have reasonable
            defaults. The MCMC is considered to have converged if
            the mean Gelman-Rubin R statistic
            (http://www.jstor.org/stable/2246093) over all
            :math:`\pi_{r,x}` values is <= `r_max` and the mean
            effective sample size is >= `neff_min`. The MCMC first
            runs with `nchains` chains and `niter` iterations. If it
            fails to converge, it increases the iterations by a
            factor of `increasefac` and tries again, and repeats
            until it converges or has tried `increasetries` times to
            increase the number of iterations. If the effective
            sample size exceeds 3 times `neff_min`, then we allow R
            to be as large as `1 + 1.5 * (r_max - 1)`.

    Returns:
        The tuple `(converged, pi_means, pi_95credint, logstring)`
        where:

            - `converged` is `True` if the MCMC converged, `False`
              otherwise.

            - `pi_means` is a dict keyed by characters in `charlist`
              with the value giving the :math:`\pi_{r,a}`.

            - `pi_95credint` is a dict keyed by characters, values
              are 2-tuples giving the median-centered credible
              interval for :math:`\pi_{r,a}`.

            - `logstring` is a string describing the MCMC run and
              convergence.
    """
    logstring = ['\tBeginning MCMC at %s' % time.asctime()]
    random.seed(seed)
    numpy.random.seed(seed)
    assert nchains >= 2, "nchains must be at least two"
    assert niter >= 100, "niter must be at least 100"
    assert len(charlist) == len(set(charlist))
    assert wtchar in charlist
    data = {'Nchar':len(charlist),
            'iwtchar':charlist.index(wtchar) + 1,
            'nrpre':[counts['pre'][c] for c in charlist],
            'nrpost':[counts['post'][c] for c in charlist],
            'pir_prior_params':[max(PRIOR_MIN_VALUE,
                priors['pir_prior_params'][c]) for c in charlist],
            'mur_prior_params':[max(PRIOR_MIN_VALUE,
                priors['mur_prior_params'][c]) for c in charlist],
           }
    if error_model == 'none':
        sm = StanModelNoneErr().model
    elif isinstance(error_model, StanModelNoneErr):
        sm = error_model.model
        error_model = 'none'
    elif error_model == 'same' or isinstance(error_model,
            StanModelSameErr):
        if error_model == 'same':
            sm = StanModelSameErr().model
        else:
            sm = error_model.model
            error_model = 'same'
        data['nrerr'] = [counts['err'][c] for c in charlist]
        data['epsilonr_prior_params'] = [max(PRIOR_MIN_VALUE,
                priors['epsilonr_prior_params'][c]) for c in charlist]
    elif (error_model == 'different' or
            isinstance(error_model, StanModelDifferentErr)):
        if error_model == 'different':
            sm = StanModelDifferentErr().model
        else:
            sm = error_model.model
            error_model = 'different'
        data['nrerrpre'] = [counts['errpre'][c] for c in charlist]
        data['nrerrpost'] = [counts['errpost'][c] for c in charlist]
        data['epsilonr_prior_params'] = [max(PRIOR_MIN_VALUE,
                priors['epsilonr_prior_params'][c]) for c in charlist]
        data['rhor_prior_params'] = [max(PRIOR_MIN_VALUE,
                priors['rhor_prior_params'][c]) for c in charlist]
    else:
        raise ValueError("Invalid error_model {0}".format(error_model))

    ntry = 0
    while True: # run until converged or tries exhausted
        init = _initialValuePrefs(error_model, nchains,
                charlist.index(wtchar), len(charlist))
        fit = sm.sampling(data=data, iter=niter, chains=nchains,
                seed=seed, n_jobs=n_jobs, refresh=-1, init=init)
        # extract output
        fitsummary = fit.summary()
        rownames = list(fitsummary['summary_rownames'])
        colnames = list(fitsummary['summary_colnames'])
        summary = fitsummary['summary']
        char_row_indices = dict([(c, rownames.index('pir[{0}]'.format(
                charlist.index(c) + 1))) for c in charlist])
        rindex = colnames.index('Rhat')
        neffindex = colnames.index('n_eff')
        rlist = [summary[char_row_indices[c]][rindex] for c in charlist]
        rhat_is_nan = [rhat for rhat in rlist if math.isnan(rhat)]
        rlist = [rhat for rhat in rlist if not math.isnan(rhat)]
        nefflist = [summary[char_row_indices[c]][neffindex]
                for c in charlist]
        neffmean = sum(nefflist) / float(len(nefflist))
        if not rlist:
            assert len(rhat_is_nan) == len(charlist)
            rmean = None
            logstring.append('\tAfter {0} MCMC chains each of {1} steps, '
                    'mean R = nan and mean Neff = {2}'.format(
                    nchains, niter, neffmean))
        else:
            rmean = sum(rlist) / float(len(rlist))
            logstring.append('\tAfter {0} MCMC chains each of {1} steps, '
                    'mean R = {2} and mean Neff = {3}'.format(
                    nchains, niter, rmean, neffmean))
        if rhat_is_nan:
            logstring.append('\t\tThere are {0} characters where R is '
                    'nan'.format(len(rhat_is_nan)))
        # allow convergence with stringent criteria when Rhat is nan
        # pystan appears to give Rhat of nan for sites with low preference
        # pystan Rhat values of nan are a bug according to pystan developers
        if ((len(rhat_is_nan) < 0.25 * len(charlist) and
                rmean is not None and rmean <= r_max and
                neffmean >= neff_min)
                or
                (neffmean >= 3.0 * neff_min and (rmean is None or
                rmean <= 1.0 + 1.5 * (r_max - 1.0)))):
            # converged
            logstring.append('\tMCMC converged at {0}.'.format(
                    time.asctime()))
            meanindex = colnames.index('mean')
            lower95index = colnames.index('2.5%')
            upper95index = colnames.index('97.5%')
            pi_means = dict([(c, summary[char_row_indices[c]][meanindex])
                    for c in charlist])
            pi_95credint = dict([(c,
                    (summary[char_row_indices[c]][lower95index],
                     summary[char_row_indices[c]][upper95index]))
                    for c in charlist])
            return (True, pi_means, pi_95credint, '\n'.join(logstring))
        else:
            # failed to converge
            if ntry < increasetries:
                ntry += 1
                niter = int(niter * increasefac)
                logstring.append("\tMCMC failed to converge. Doing retry "
                        "{0} with {1} iterations per chain.".format(
                        ntry, niter))
            else:
                with open('_no_converge_prefs_debug.pickle', 'wb') \
                        as f_debug:
                    pickle.dump((counts, init, fitsummary), f_debug)
                logstring.append("\tMCMC FAILED to converge after "
                        "all attempts at {0}.".format(time.asctime()))
                meanindex = colnames.index('mean')
                lower95index = colnames.index('2.5%')
                upper95index = colnames.index('97.5%')
                pi_means = dict([(c,
                        summary[char_row_indices[c]][meanindex])
                        for c in charlist])
                pi_95credint = dict([(c,
                        (summary[char_row_indices[c]][lower95index],
                         summary[char_row_indices[c]][upper95index]))
                        for c in charlist])
                return (False, pi_means, pi_95credint,
                        '\n'.join(logstring))

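# Example (illustrative sketch with made-up toy counts): inferring the
# preferences for a single site with no error model and uniform Dirichlet
# priors. This compiles and samples a Stan model, so it is far slower than
# `inferPrefsByRatio`.
#
#   charlist = ['A', 'C', 'G', 'T']
#   counts = {'pre':{'A':500, 'C':10, 'G':20, 'T':10},
#             'post':{'A':480, 'C':5, 'G':40, 'T':10}}
#   priors = {'pir_prior_params':{c:1.0 for c in charlist},
#             'mur_prior_params':{c:1.0 for c in charlist}}
#   (converged, pi_means, pi_95credint, log) = inferSitePrefs(
#           charlist, 'A', 'none', counts, priors)
#   assert numpy.allclose(sum(pi_means.values()), 1, atol=0.01)
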
def prefsToMutFromWtEffects(prefs, charlist, wts):
    """Converts preferences to effects of mutations away from wildtype.

    This function is similar to :func:`prefsToMutEffects` but computes
    the effects of mutations away from a given wildtype sequence.

    Args:
        `prefs`
            Same meaning as for :func:`prefsToMutEffects`.
        `charlist`
            Same meaning as for :func:`prefsToMutEffects`.
        `wts` (pandas DataFrame)
            Has columns named "site" and "wildtype" giving the
            wildtype amino acid for each site. The sites must match
            those in `prefs`.

    Returns:
        A pandas DataFrame similar to that returned by
        :func:`prefsToMutEffects` except that it only contains
        mutations away from the wildtype character at each site, and
        the columns specifying the wildtype and mutant characters
        are named "wildtype" and "mutant" rather than "initial" and
        "final".

    >>> charlist = ['A', 'C', 'G']
    >>> prefs = pandas.DataFrame({
    ...     'site':[1, 2],
    ...     'A':[0.25, 0.25],
    ...     'C':[0.25, 0.5],
    ...     'G':[0.5, 0.25],
    ...     })
    >>> wts = pandas.DataFrame({
    ...     'site':[1, 2],
    ...     'wildtype':['G', 'C']
    ...     })
    >>> prefsToMutFromWtEffects(prefs, charlist, wts)
       site wildtype mutant mutation  effect  log2effect
    0     1        G      A      G1A     0.5        -1.0
    1     1        G      C      G1C     0.5        -1.0
    2     1        G      G      G1G     1.0         0.0
    3     2        C      A      C2A     0.5        -1.0
    4     2        C      C      C2C     1.0         0.0
    5     2        C      G      C2G     0.5        -1.0
    """
    muteffects = prefsToMutEffects(prefs, charlist)

    if not ({'wildtype', 'site'} <= set(wts.columns)):
        raise ValueError("`wts` doesn't have site and wildtype cols")
    if not (set(wts.wildtype) <= set(charlist)):
        raise ValueError("wildtype not all in `charlist`")
    if (prefs.site.sort_values() != wts.site.sort_values()).any():
        raise ValueError("`prefs` and `wts` do not have same sites")
    assert 'site' in muteffects.columns
    assert 'wildtype' not in muteffects.columns

    muteffects_from_wt = (
            muteffects
            .merge(wts[['site', 'wildtype']], on='site')
            .query('initial == wildtype')
            .drop(columns=['initial'])
            .rename(columns={'final':'mutant'})
            [['site', 'wildtype', 'mutant', 'mutation',
              'effect', 'log2effect']]
            .sort_values(['site', 'mutant'])
            .reset_index(drop=True)
            )

    return muteffects_from_wt

def prefsToMutEffects(prefs, charlist):
    r"""Converts amino acid preferences to effects of specific mutations.

    If the preference of site :math:`r` for amino acid :math:`a` is
    :math:`\pi_{r,a}`, then the estimated effect (e.g., ratio of
    enrichment ratios) for mutating that site from :math:`x` to
    :math:`y` is :math:`\frac{\pi_{r,y}}{\pi_{r,x}}`. Very small
    values indicate disfavored mutations, and very large values
    indicate highly favored mutations. The logarithm base 2 of the
    expected effects is also a useful measure -- negative values are
    disfavored mutations, positive values are favored ones.

    Args:
        `prefs` (pandas DataFrame)
            Preferences to analyze. The columns should be `site` and
            a column for every character in `charlist`.
        `charlist` (list)
            The list of characters that we are analyzing. For
            instance, `dms_tools2.AAS` for amino acids.

    Returns:
        A pandas DataFrame where the columns are:

            * `site`: the site

            * `initial`: the initial character (e.g., amino acid)

            * `final`: the final character after the mutation

            * `mutation`: mutation in the string form `A1G`

            * `effect`: the effect of the mutation

            * `log2effect`: the log2 of the effect of the mutation.

    >>> charlist = ['A', 'C', 'G']
    >>> prefs = pandas.DataFrame({
    ...     'site':[1, 2],
    ...     'A':[0.25, 0.25],
    ...     'C':[0.25, 0.5],
    ...     'G':[0.5, 0.25],
    ...     })
    >>> effects = prefsToMutEffects(prefs, charlist)
    >>> set(effects.columns) == {'site', 'initial', 'final',
    ...         'mutation', 'effect', 'log2effect'}
    True
    >>> numpy.allclose(effects[effects['initial'] == 'A']['effect'],
    ...         [1, 1, 2, 1, 2, 1])
    True
    >>> numpy.allclose(effects[effects['initial'] == 'C']['effect'],
    ...         [1, 1, 2, 0.5, 1, 0.5])
    True
    >>> numpy.allclose(effects['effect'], 2**effects['log2effect'])
    True
    """
    if not (set(prefs.columns) <= set(['site'] + charlist)):
        raise ValueError("`prefs` contain extra columns")

    initial = prefs.melt(id_vars='site', value_vars=charlist,
            var_name='initial', value_name='initial_pref')
    final = initial.rename(columns=
            {'initial':'final', 'initial_pref':'final_pref'})
    effects = final.merge(initial, on='site')
    effects['effect'] = effects['final_pref'] / effects['initial_pref']
    effects['log2effect'] = numpy.log2(effects['effect'])
    effects['mutation'] = (effects['initial'] +
            effects['site'].map(str) + effects['final'])

    return (effects
            .drop(['final_pref', 'initial_pref'], axis=1)
            .sort_values(['site', 'initial', 'final'])
            [['site', 'initial', 'final', 'mutation',
              'effect', 'log2effect']]
            .reset_index(drop=True)
            )

def rescalePrefs(prefs, stringency):
    r"""Re-scale amino acid preferences by stringency parameter.

    If the initial preference of site :math:`r` for amino-acid
    :math:`a` is :math:`\pi_{r,a}`, then the re-scaled preference is

    .. math::

        \frac{\left(\pi_{r,a}\right)^{\beta}}
             {\sum_{a'} \left(\pi_{r,a'}\right)^{\beta}}

    where :math:`\beta` is the stringency parameter.

    Args:
        `prefs` (pandas.DataFrame)
            Columns are 'site' and then every character (e.g.,
            amino acid).
        `stringency` (float >= 0)
            The stringency parameter.

    Returns:
        A data frame in the same format as `prefs` but with the
        re-scaled preferences.

    >>> prefs = pandas.DataFrame(dict(
    ...     [('site', [1, 2]), ('A', [0.24, 0.03]), ('C', [0.04, 0.43])]
    ...     + [(aa, [0.04, 0.03]) for aa in dms_tools2.AAS if
    ...     aa not in ['A', 'C']]))
    >>> numpy.allclose(1, prefs.drop('site', axis=1).sum(axis=1))
    True
    >>> rescaled = rescalePrefs(prefs, 1.0)
    >>> all([numpy.allclose(prefs[c], rescaled[c]) for c in prefs.columns])
    True
    >>> rescaled2 = rescalePrefs(prefs, 2.0)
    >>> all(rescaled2['site'] == prefs['site'])
    True
    >>> numpy.allclose(rescaled2['A'], [0.6545, 0.0045], atol=1e-3)
    True
    >>> numpy.allclose(rescaled2['C'], [0.0182, 0.9153], atol=1e-3)
    True
    """
    assert set(prefs.drop('site', axis=1).select_dtypes(
            include=[numpy.number]).columns) == set(
            prefs.drop('site', axis=1).columns), ('Non-numeric '
            'columns other than "site"')

    chars = prefs.drop('site', axis=1).columns
    rescaled = prefs[chars].pow(stringency)
    rescaled = rescaled.divide(rescaled.sum(axis=1), axis='index')
    rescaled['site'] = prefs['site']
    return rescaled[prefs.columns]

def avgPrefs(prefsfiles):
    """Gets average of site-specific preferences.

    Args:
        `prefsfiles` (list)
            List of CSV files containing preferences, must all be
            for same sites and characters.

    Returns:
        A `pandas.DataFrame` containing the average of the
        preferences in `prefsfiles`. In this returned data frame,
        `site` is a column rather than the index.

    >>> tf1 = tempfile.NamedTemporaryFile
    >>> tf2 = tempfile.NamedTemporaryFile
    >>> with tf1(mode='w') as file1, tf2(mode='w') as file2:
    ...     x = file1.write('site,A,C,G,T\\n'
    ...                     '10,0.2,0.2,0.5,0.1\\n'
    ...                     '2a,0.3,0.3,0.3,0.1')
    ...     file1.flush()
    ...     x = file2.write('site,A,C,G,T\\n'
    ...                     '10,0.4,0.1,0.1,0.4\\n'
    ...                     '2a,0.3,0.4,0.1,0.2')
    ...     file2.flush()
    ...     avg = avgPrefs([file1.name, file2.name])
    >>> (avg['site'] == ['2a', '10']).all()
    True
    >>> numpy.allclose(avg['A'], [0.3, 0.3])
    True
    >>> numpy.allclose(avg['C'], [0.35, 0.15])
    True
    >>> numpy.allclose(avg['G'], [0.2, 0.3])
    True
    >>> numpy.allclose(avg['T'], [0.15, 0.25])
    True
    """
    assert len(prefsfiles) >= 1
    prefs = [pandas.read_csv(f, index_col='site').sort_index()
            for f in prefsfiles]

    # make sure all have the same columns in the same order
    cols = prefs[0].columns
    for i in range(len(prefs)):
        assert set(cols) == set(prefs[i].columns)
        prefs[i] = prefs[i][cols]

    avgprefs = pandas.concat(prefs).groupby('site').mean().reset_index()

    # natural sort by site: https://stackoverflow.com/a/29582718
    avgprefs = avgprefs.reindex(index=natsort.order_by_index(
            avgprefs.index, natsort.index_realsorted(avgprefs.site)))

    return avgprefs

def prefsEntropy(prefs, charlist):
    r"""Calculate site entropy and number of effective characters.

    The site entropy :math:`h_r` for site :math:`r` is defined as
    :math:`h_r = -\sum_{x} \pi_{r,x} \log\left(\pi_{r,x}\right)`
    where :math:`\pi_{r,x}` is the preference for character (e.g.,
    amino acid) :math:`x` at site :math:`r`, and the log is the
    natural logarithm.

    The number of effective characters at site :math:`r` is
    :math:`N_{\rm{eff}, r} = \exp\left(h_r\right)`.

    Args:
        `prefs` (pandas DataFrame)
            Data frame with the preferences. Should have a column
            named `site` and a column for each character in
            `charlist`.
        `charlist` (list)
            List of the characters of interest, for example the
            amino acids.

    Returns:
        A copy of `prefs` that has additional columns named
        `entropy` and `neffective`, giving the site entropy and
        number of effective characters for each site. For the
        entropies, log is taken to base :math:`e`.

    >>> charlist = ['A', 'C', 'G', 'T']
    >>> prefs = pandas.DataFrame({
    ...     'site':[1, 2, 3],
    ...     'A':[0.25, 0.6, 1 / 3.],
    ...     'C':[0.25, 0.3, 1 / 3.],
    ...     'G':[0.25, 0.1, 1 / 3.],
    ...     'T':[0.25, 0.0, 0.0],
    ...     })
    >>> prefs_entropy = prefsEntropy(prefs, charlist)
    >>> (set(['entropy', 'neffective', 'site'] + charlist) ==
    ...         set(prefs_entropy.columns))
    True
    >>> h2 = -0.6 * math.log(0.6) - 0.3 * math.log(0.3) - 0.1 * math.log(0.1)
    >>> numpy.allclose(prefs_entropy['entropy'],
    ...         [math.log(4), h2, math.log(3)])
    True
    >>> numpy.allclose(prefs_entropy['neffective'],
    ...         [4, math.exp(h2), 3])
    True
    """
    assert 'site' in prefs.columns, "prefs does not have `site` column"
    assert set(charlist) < set(prefs.columns), \
            "cols do not contain charlist"
    assert charlist, "no characters specified"
    assert numpy.allclose(prefs[charlist].sum(axis=1), 1, atol=1e-4),\
            "prefs do not sum to one"

    prefs_entropy = prefs.copy()
    prefs_entropy['entropy'] = 0
    for char in charlist:
        # replace zero preferences with one so that the term for this
        # character contributes zero to the entropy, since log(1) = 0
        p_char = prefs_entropy[char].where(prefs_entropy[char] > 0, 1)
        prefs_entropy['entropy'] -= numpy.log(p_char) * p_char
    prefs_entropy['neffective'] = numpy.exp(prefs_entropy['entropy'])

    return prefs_entropy

def aafreqsFromAlignment(alignmentfile, codon_to_aa,
        ignore_gaps=True, ignore_stop=True):
    """Get amino-acid frequencies at each site in alignment.

    Args:
        `alignmentfile` (str)
            FASTA file with alignment of proteins or coding
            sequences.
        `codon_to_aa` (bool)
            If `True`, translate codon alignment to amino acids.
        `ignore_gaps` (bool)
            Ignore gaps when calculating frequencies.
        `ignore_stop` (bool)
            Ignore stop codons when calculating frequencies.

    Returns:
        A `pandas.DataFrame` with columns being `site` (1, 2, ...
        numbering) and other columns being amino acids and values
        giving frequencies in alignment.

    >>> with tempfile.NamedTemporaryFile(mode='w') as f:
    ...     x = f.write('>seq1\\n'
    ...                 'ATGGGGCAG\\n'
    ...                 '>seq2\\n'
    ...                 '---AGGCAG\\n'
    ...                 '>seq3\\n'
    ...                 'ATGTGACAG')
    ...     f.flush()
    ...     aafreqs = aafreqsFromAlignment(f.name, codon_to_aa=True)
    >>> aas_counts = ['M', 'G', 'R', 'Q']
    >>> aas_nocounts = [a for a in dms_tools2.AAS if a not in aas_counts]
    >>> (0 == aafreqs[aas_nocounts].values).all()
    True
    >>> expected_counts = pandas.DataFrame.from_dict(
    ...     collections.OrderedDict([
    ...         ('site', [1, 2, 3]), ('M', [1.0, 0.0, 0.0]),
    ...         ('G', [0.0, 0.5, 0]), ('R', [0.0, 0.5, 0.0]),
    ...         ('Q', [0.0, 0.0, 1.0])]))
    >>> expected_counts.equals(aafreqs[['site'] + aas_counts])
    True
    """
    # read sequences
    seqs = [s.seq for s in Bio.SeqIO.parse(alignmentfile, 'fasta')]
    if codon_to_aa:
        seqs = [s.translate(gap='-', stop_symbol='*') for s in seqs]
    assert seqs, "No sequences"
    seqlen = len(seqs[0])
    assert seqlen, "sequences have no length"
    assert all([seqlen == len(s) for s in seqs]), "seqs not same length"

    # get character sets
    aas = dms_tools2.AAS.copy()
    skipchars = []
    if ignore_gaps:
        skipchars.append('-')
    else:
        aas.append('-')
    if ignore_stop:
        skipchars.append('*')
    else:
        aas.append('*')

    # tally amino-acid frequencies
    aafreqs = dict([(col, [0] * seqlen) for col in aas])
    aafreqs['site'] = list(range(1, seqlen + 1))
    for s in seqs:
        for (r, aa) in enumerate(s):
            if aa in skipchars:
                continue
            else:
                aafreqs[aa][r] += 1

    # convert to dataframe and change counts to freqs
    aafreqs = pandas.DataFrame(aafreqs)
    ncounts = aafreqs[aas].sum(axis=1).astype('float')
    for aa in aas:
        aafreqs[aa] = aafreqs[aa] / ncounts

    return aafreqs[['site'] + aas].fillna(0)

if __name__ == '__main__':
    import doctest
    doctest.testmod()