"""
========================
prefs
========================
Performs operations related to estimating site-specific amino-acid
preferences.
Uses `pystan <https://pystan.readthedocs.io/en/latest>`_
to perform MCMC for Bayesian inferences.
"""
import time
import math
import tempfile
import pickle
import random
import collections
import natsort
import numpy
import numpy.random
import pandas
import Bio.SeqIO
import pystan
import dms_tools2
#: minimum value for Dirichlet prior elements
PRIOR_MIN_VALUE = 1.0e-7
[docs]class StanModelNoneErr(object):
"""``pystan`` model when `error_model` is `none`.
For use by inferSitePrefs`."""
def __init__(self, verbose=False):
"""Compile ``pystan`` model.
Args:
`verbose` (bool)
Set to `True` if you want verbose compilation.
"""
self.pystancode =\
"""
data {{
int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
int<lower=0> nrpre[Nchar]; // counts pre-selection
int<lower=0> nrpost[Nchar]; // counts post-selection
vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
}}
parameters {{
simplex[Nchar] pir;
simplex[Nchar] mur;
}}
transformed parameters {{
simplex[Nchar] fr;
fr = pir .* mur / dot_product(pir, mur);
}}
model {{
pir ~ dirichlet(pir_prior_params);
mur ~ dirichlet(mur_prior_params);
nrpre ~ multinomial(mur);
nrpost ~ multinomial(fr);
}}
""".format(PRIOR_MIN_VALUE)
self.model = pystan.StanModel(model_code=self.pystancode,
verbose=verbose)
[docs]class StanModelSameErr:
"""``pystan`` model when `error_model` is `same`.
For use by inferSitePrefs`."""
def __init__(self, verbose=False):
"""Compile ``pystan`` model.
Args:
`verbose` (bool)
Set to `True` if you want verbose compilation.
"""
self.pystancode =\
"""
data {{
int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
int<lower=1, upper=Nchar> iwtchar; // index of wildtype character in 1, ... numbering
int<lower=0> nrpre[Nchar]; // counts pre-selection
int<lower=0> nrpost[Nchar]; // counts post-selection
int<lower=0> nrerr[Nchar]; // counts in error control
vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] epsilonr_prior_params; // Dirichlet prior params
}}
transformed data {{
simplex[Nchar] deltar;
for (ichar in 1:Nchar) {{
deltar[ichar] = 0.0;
}}
deltar[iwtchar] = 1.0;
}}
parameters {{
simplex[Nchar] pir;
simplex[Nchar] mur;
simplex[Nchar] epsilonr;
}}
transformed parameters {{
simplex[Nchar] fr_plus_err;
simplex[Nchar] mur_plus_err;
fr_plus_err = pir .* mur / dot_product(pir, mur) + epsilonr - deltar;
mur_plus_err = mur + epsilonr - deltar;
}}
model {{
pir ~ dirichlet(pir_prior_params);
mur ~ dirichlet(mur_prior_params);
epsilonr ~ dirichlet(epsilonr_prior_params);
nrerr ~ multinomial(epsilonr);
nrpre ~ multinomial(mur_plus_err);
nrpost ~ multinomial(fr_plus_err);
}}
""".format(PRIOR_MIN_VALUE)
self.model = pystan.StanModel(model_code=self.pystancode,
verbose=verbose)
[docs]class StanModelDifferentErr:
"""``pystan`` model when `error_model` is `different`.
For use by inferSitePrefs`."""
def __init__(self, verbose=False):
"""Compile ``pystan`` model.
Args:
`verbose` (bool)
Set to `True` if you want verbose compilation.
"""
self.pystancode =\
"""
data {{
int<lower=1> Nchar; // 64 for codons, 20 for amino acids, 4 for nucleotides
int<lower=1, upper=Nchar> iwtchar; // index of wildtype character in 1, ... numbering
int<lower=0> nrpre[Nchar]; // counts pre-selection
int<lower=0> nrpost[Nchar]; // counts post-selection
int<lower=0> nrerrpre[Nchar]; // counts in pre-selection error control
int<lower=0> nrerrpost[Nchar]; // counts in post-selection error control
vector<lower={0:g}>[Nchar] pir_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] mur_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] epsilonr_prior_params; // Dirichlet prior params
vector<lower={0:g}>[Nchar] rhor_prior_params; // Dirichlet prior params
}}
transformed data {{
simplex[Nchar] deltar;
for (ichar in 1:Nchar) {{
deltar[ichar] = 0.0;
}}
deltar[iwtchar] = 1.0;
}}
parameters {{
simplex[Nchar] pir;
simplex[Nchar] mur;
simplex[Nchar] epsilonr;
simplex[Nchar] rhor;
}}
transformed parameters {{
simplex[Nchar] fr_plus_err;
simplex[Nchar] mur_plus_err;
fr_plus_err = pir .* mur / dot_product(pir, mur) + rhor - deltar;
mur_plus_err = mur + epsilonr - deltar;
}}
model {{
pir ~ dirichlet(pir_prior_params);
mur ~ dirichlet(mur_prior_params);
epsilonr ~ dirichlet(epsilonr_prior_params);
rhor ~ dirichlet(rhor_prior_params);
nrerrpre ~ multinomial(epsilonr);
nrerrpost ~ multinomial(rhor);
nrpre ~ multinomial(mur_plus_err);
nrpost ~ multinomial(fr_plus_err);
}}
""".format(PRIOR_MIN_VALUE)
self.model = pystan.StanModel(model_code=self.pystancode,
verbose=verbose)
def _initialValuePrefs(error_model, nchains, iwtchar, nchars):
"""Gets valid initial values for ``pystan`` preference inference.
Values initialized by ``pystan`` frequently have invalid values.
This function will generate random values that are valid
for initialization and return a list that can be passed to the
`StanModel` as the `init` argument.
"""
initattempts = 10 # might fail for pathological random values
nrescales = 10 # rescale non-wildtype values down this many times
rescalefactor = 5.0 # rescale non-wildtype values down by this much
deltar = numpy.zeros(nchars)
deltar[iwtchar] = 1.0
init = []
concparams = [1.0 for ichar in range(nchars)]
concparams[iwtchar] = 10.0 # make this element bigger
if error_model == 'none':
for chain in range(nchains):
while True:
chain_init = {
'pir':numpy.random.dirichlet(concparams),
'mur':numpy.random.dirichlet(concparams),
}
if (all(chain_init['pir'] > PRIOR_MIN_VALUE) and
all(chain_init['mur'] > PRIOR_MIN_VALUE)):
init.append(chain_init)
break
return init
elif error_model == 'same':
posterrname = 'epsilonr'
elif error_model == 'different':
posterrname = 'rhor'
else:
raise ValueError("Invalid error_model {0}".format(error_model))
for chain in range(nchains):
for iattempt in range(initattempts):
chain_init = {
'pir':numpy.random.dirichlet(concparams),
'mur':numpy.random.dirichlet(concparams),
'epsilonr':numpy.random.dirichlet(concparams),
}
if error_model == 'different':
chain_init['rhor'] = numpy.random.dirichlet(concparams)
irescale = 0
while irescale < nrescales and (any(
chain_init['pir'] * chain_init['mur'] /
numpy.dot(chain_init['pir'], chain_init['mur'])
+ chain_init[posterrname] - deltar < PRIOR_MIN_VALUE)
or any(chain_init['mur'] + chain_init['epsilonr']
- deltar < PRIOR_MIN_VALUE)):
irescale += 1
chain_init['epsilonr'] /= rescalefactor
chain_init['epsilonr'][iwtchar] = 1.0 - sum(
[chain_init['epsilonr'][ichar] for ichar
in range(nchars) if ichar != iwtchar])
if error_model == 'different':
chain_init['rhor'] /= rescalefactor
chain_init['rhor'][iwtchar] = 1.0 - sum([
chain_init['rhor'][ichar] for ichar in
range(nchars) if ichar != iwtchar])
if (all(chain_init['pir'] * chain_init['mur'] /
numpy.dot(chain_init['pir'], chain_init['mur']) +
chain_init[posterrname] - deltar > PRIOR_MIN_VALUE)
and all(chain_init['mur'] + chain_init['epsilonr'] -
deltar > PRIOR_MIN_VALUE)):
break
else:
raise ValueError("Failed to initialize")
init.append(chain_init)
return init
[docs]def inferPrefsByRatio(charlist, sites, wts, pre, post, errpre,
errpost, pseudocount):
r"""Site-specific preferences from normalized enrichment ratios.
Calculates site-specific preference :math:`\pi_{r,a}` of each
site :math:`r` for each character :math:`a` using re-normalized
enrichment ratios. This function accomplishes the same goal as
`inferSitePrefs`, but is much simpler and faster, although less
statistically rigorous in principle.
Specifically, this function calculates the preferences as
.. math::
\pi_{r,a} = \frac{\phi_{r,a}}{\sum_{a'} \phi_{r,a'}}
where :math:`\phi_{r,a}` is the enrichment ratio of character
:math:`a` relative to wildtype after selection.
The actual definition of these enrichment ratios is somewhat
complex due to the need to correct for errors and add a
pseudocount. We have 4 sets of counts:
- `pre`: pre-selection counts
- `post`: post-selection counts
- `errpre`: error-control for pre-selection counts
- `errpost`: error-control for post-selection counts
For each set of counts :math:`s`, let :math:`N_r^s` (e.g.,
:math:`N_r^{pre}`) denote the total counts for this sample at
site :math:`r`, and let :math:`n_{r,a}^{s}` (e.g.,
:math:`n_{r,a}^{pre}`) denote the counts at that site for
character :math:`a`.
Because some of the counts are low, we add a pseudocount
:math:`P` to each observation. Importantly, we need to scale
this pseudocount by the total depth for that sample at that site
in order to avoid systematically estimating different frequencies
purely as a function of sequencing depth, which will bias
the preference estimates.
Therefore, given the pseudocount value :math:`P` defined via the
``--pseudocount`` argument to :ref:`dms2_prefs`, we calculate the
scaled pseudocount for sample :math:`s` and site :math:`r` as
.. math::
P_r^s = P \times \frac{N_r^s}{\min\limits_{s'} N_r^{s'}}.
This equation makes the pseudocount :math:`P` for the lowest-depth
sample, and scales up the pseodocounts for all other samples
proportional to their depth.
With this pseudocount, the
estimated frequency of character :math:`a` at site :math:`r`
in sample :math:`s` is
.. math::
f_{r,a}^s = \frac{n_{r,a}^s + P_r^s}{N_{r}^s + A \times P_r^s}
where :math:`A` is the number of characters in the alphabet (e.g.,
20 for amino acids without stop codons).
The **error-corrected** estimates of the frequency of character
:math:`a` before and after selection are then
.. math::
f_{r,a}^{before} &= \max\left(\frac{P_r^{pre}}{N_{r}^{pre} + A \times P_r^{pre}},
\; f_{r,a}^{pre} + \delta_{a,\rm{wt}\left(r\right)}
- f_{r,a}^{errpre}\right)
f_{r,a}^{after} &= \max\left(\frac{P_r^{post}}{N_{r}^{post} + A \times P_r^{post}},
\; f_{r,a}^{post} + \delta_{a,\rm{wt}\left(r\right)}
- f_{r,a}^{errpost}\right)
where :math:`\delta_{a,\rm{wt}\left(r\right)}` is the
Kronecker-delta, equal to 1 if :math:`a` is the same as the
wildtype character :math:`\rm{wt}\left(r\right)` at site
:math:`r`, and 0 otherwise. We use the :math:`\max` operator
to ensure that even if the error-control frequency exceeds
the actual estimate, our estimated frequency never drops
below the pseudocount over the depth of the uncorrected sample.
Given these definitions, we then simply calculate the enrichment
ratios as
.. math::
\phi_{r,a} = \frac{\left(f_{r,a}^{after}\right) /
\left(f_{r,\rm{wt}\left(r\right)}^{after}\right)}
{\left(f_{r,a}^{before}\right) /
\left(f_{r,\rm{wt}\left(r\right)}^{before}\right)}
In the case where we are **not** using any error-controls, then
we simply set
:math:`f_{r,\rm{wt}}^{errpre} = f_{r,\rm{wt}}^{errpost}
= \delta_{a,\rm{wt}\left(r\right)}`.
Args:
`charlist` (list)
Valid characters (e.g., codons, amino acids, nts).
`sites` (list)
Sites to analyze.
`wts` (list)
`wts[r]` is the wildtype character at site `sites[r]`.
`pre` (pandas.DataFrame)
Gives pre-selection counts. Should have columns
with names of 'site' and all characters in `charlist`.
The rows give the counts of each character at that site.
`post` (pandas.DataFrame)
Like `pre` but for post-selection counts.
`errpre` (`None` or pandas.DataFrame)
Like `pre` but for pre-selection error-control counts,
or `None` if there is no such control.
`errpost` (`None` or pandas.DataFrame)
Like `pre` but for post-selection error-control counts,
or `None` if there is no such control.
`pseudocount` (float or int)
The pseudocount to add to each observation.
Returns:
A pandas.DataFrame holding the preferences. The columns of
this dataframe are 'site' and all characters in `charlist`.
For each site in `sites`, the rows give the preference
for that character.
"""
assert len(wts) == len(sites) > 0
assert all([wt in charlist for wt in wts]), "invalid char in wts"
assert pseudocount > 0, "pseudocount must be greater than zero"
dfs = {'pre':pre.copy(),
'post':post.copy()
}
if errpre is not None:
dfs['errpre'] = errpre.copy()
if errpost is not None:
dfs['errpost'] = errpost.copy()
# compute total depth for each sample and sort by sites
depths = []
for stype in dfs.keys():
df = dfs[stype]
assert set(list(charlist) + ['site']) <= set(df.columns)
assert set(sites) <= set(df['site'])
df = df.query('site in @sites')
assert len(df.index) == len(wts) == len(sites)
df['site'] = pandas.Categorical(df['site'], sites)
df = df.sort_values('site').reset_index(drop=True)
df['Nr'] = df[charlist].sum(axis=1).astype('float')
depths.append(df['Nr'])
dfs[stype] = df
mindepth = pandas.concat(depths, axis=1).min(axis=1)
# calculate scaled pseudocounts
pseudocounts = dict([(stype,
pseudocount * (dfs[stype]['Nr'] / mindepth).fillna(1.0))
for stype in dfs.keys()])
# calculate frequencies
for stype in dfs.keys():
df = dfs[stype]
for c in charlist:
df['f{0}'.format(c)] = (df[c] + pseudocounts[stype]) / (
df['Nr'] + len(charlist) * pseudocounts[stype])
df['delta{0}'.format(c)] = [int(wt == c) for wt in wts]
dfs[stype] = df
# fill values for errpre / errpost if they were None
for stype in ['pre', 'post']:
err = 'err{0}'.format(stype)
if err not in dfs:
dfs[err] = pandas.DataFrame(dict([('site', sites)] +
[('f{0}'.format(c), dfs[stype]['delta{0}'.format(c)])
for c in charlist]))
# compute phi values
fr = {}
for (key, f, ferr, p) in [
('before', dfs['pre'], dfs['errpre'], pseudocounts['pre']),
('after', dfs['post'], dfs['errpost'], pseudocounts['post']),
]:
fr[key] = {'wt':numpy.zeros(len(sites))}
for c in charlist:
fr[key][c] = numpy.maximum(
p / (f['Nr'].values + len(charlist) * p),
f['f{0}'.format(c)].values
+ f['delta{0}'.format(c)].values
- ferr['f{0}'.format(c)].values
)
fr[key]['wt'] += fr[key][c] * f['delta{0}'.format(c)].values
phi = pandas.DataFrame({'site':sites})
for c in charlist:
phi[c] = ((fr['after'][c] / fr['after']['wt']) /
(fr['before'][c] / fr['before']['wt']))
phi['denom'] = phi[charlist].sum(axis=1)
# normalize phi values to get pi values
prefs = phi[charlist].div(phi['denom'], axis=0)
prefs.insert(0, 'site', sites)
return prefs
[docs]def inferSitePrefs(charlist, wtchar, error_model, counts,
priors, seed=1, niter=10000, increasetries=5, n_jobs=1,
r_max=1.1, neff_min=100, nchains=4, increasefac=2):
r"""Infers site-specific preferences by MCMC for a specific site.
Infer the site-specific preferences :math:`\pi_{r,a}` for some site
:math:`r` for each character :math:`a` by integrating over the
posterior defined by the product of the following priors and
likelihoods, where :math:`\boldsymbol{\mathbf{\pi_r}}` indicates
the vector of :math:`\pi_{r,a}` values for all characters:
.. math::
\Pr\left(\boldsymbol{\mathbf{\pi_r}}\right)
&= \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\pi,r}}}\right)
\Pr\left(\boldsymbol{\mathbf{\mu_r}}\right)
&= \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\mu,r}}}\right)
\Pr\left(\boldsymbol{\mathbf{\epsilon_r}}\right)
&= \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\epsilon,r}}}\right)
\Pr\left(\boldsymbol{\mathbf{\rho_r}}\right)
&= \mathrm{Dirichlet}\left(\boldsymbol{\mathbf{a_{\rho,r}}}\right)
\Pr\left(\boldsymbol{\mathbf{n_r^{\rm{pre}}}} \mid \boldsymbol{\mathbf{\mu_r}}, \boldsymbol{\mathbf{\epsilon_r}}\right)
&= \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{pre}}}}; \boldsymbol{\mathbf{\mu_r}} + \boldsymbol{\mathbf{\epsilon_r}} - \boldsymbol{\mathbf{\delta_r}}\right)
\Pr\left(\boldsymbol{\mathbf{n_r^{\rm{post}}}} \mid \boldsymbol{\mathbf{\mu_r}}, \boldsymbol{\mathbf{\epsilon_r}}\right)
&= \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{post}}}}; \frac{\boldsymbol{\mathbf{\mu_r}} \circ \boldsymbol{\mathbf{\pi_r}}}{\boldsymbol{\mathbf{\mu_r}} \cdot \boldsymbol{\mathbf{\pi_r}}} + \boldsymbol{\mathbf{\rho_r}} - \boldsymbol{\mathbf{\delta_r}}\right)
\Pr\left(\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}} \mid \boldsymbol{\mathbf{\epsilon_r}}\right)
&= \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}}; \boldsymbol{\mathbf{\epsilon_r}}\right)
\Pr\left(\boldsymbol{\mathbf{n_r^{\rm{err,post}}}} \mid \boldsymbol{\mathbf{\rho_r}}\right)
&= \mathrm{Multinomial}\left(\boldsymbol{\mathbf{n_r^{\rm{err,post}}}}; \boldsymbol{\mathbf{\rho_r}}\right)
where :math:`\boldsymbol{\mathbf{\delta_r}}` is a vector that is
zero except for a one at the element corresponding to the wildtype.
The MCMC tries to guarantee convergence via the parameters specified
by `r_max`, `neff_min`, `nchains`, `niter`, `increasefac`,
and `increasetries`.
Args:
`charlist` (list)
List of valid characters (e.g., codons, amino acids, nts).
`wtchar` (str)
Wildtype character at the site.
`error_model` (str or object)
Specifies how errors are estimated. Passing error model
objects is faster if calling this function repeatedly
as they will not need to be compiled. Can be:
- The str `none` or instance of `StanModelNoneErr`:
no errors (:math:`\boldsymbol{\mathbf{\epsilon_r}} = \boldsymbol{\mathbf{\rho_r}} = \mathbf{0}`)
- The str `same` or instance of `StanModelSameErr`:
same error rates pre and post-selection
(:math:`\boldsymbol{\mathbf{\epsilon_r}} = \boldsymbol{\mathbf{\rho_r}}`).
- The str `different` or instance of `StanModelDifferentErr`:
different error rates pre- and post-selection
(:math:`\boldsymbol{\mathbf{\epsilon_r}} \ne \boldsymbol{\mathbf{\rho_r}}`).
`counts` (dict)
Deep sequencing counts. Each string key should specify
a dict keyed by all characters in `charlist` with
the values giving integer counts for that character. The keys:
- `pre`: :math:`\boldsymbol{\mathbf{n_r^{\rm{pre}}}}`
- `post`: :math:`\boldsymbol{\mathbf{n_r^{\rm{post}}}}`
- *err*: required if `error_model` is `same`, specifies
:math:`\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}} = \boldsymbol{\mathbf{n_r^{\rm{err,post}}}}`
- `errpre` and `errpost`: required if `error_model` is
`different`, specify
:math:`\boldsymbol{\mathbf{n_r^{\rm{err,pre}}}}` and
:math:`\boldsymbol{\mathbf{n_r^{\rm{err,post}}}}`.
`priors` (dict)
Specifies parameter vectors for Dirichlet priors. Each string
key should specify a dict keyed by all characters and values
giving the prior for that character. Values less than
`PRIOR_MIN_VALUE` are set to `PRIOR_MIN_VALUE`. Keys are
- `pir_prior_params`: :math:`\boldsymbol{\mathbf{a_{\pi,r}}}`
- `mur_prior_params`: :math:`\boldsymbol{\mathbf{a_{\mu,r}}}`
- `epsilonr_prior_params`: only required if `error_model` is
`same` or `different`, specifies
:math:`\boldsymbol{\mathbf{a_{\epsilon,r}}}`
- `rhor_prior_params` : only required if `error_model` is
`different`, specifies
:math:`\boldsymbol{\mathbf{a_{\rho,r}}}`
`seed` (int)
Random number seed for MCMC.
`n_jobs` (int)
Number of CPUs to use, -1 means all available.
`niter`, `increasetries`, `r_max`, `neff_min`, `nchains`, `increasefac`
Specify MCMC convergence. They all have reasonable defaults.
The MCMC is considered to have converged if the mean Gelman-Rubin R
statistic (http://www.jstor.org/stable/2246093) over all
:math:`\pi_{r,x}` values <= `r_max` and the mean effective sample
size is >= `neff_min`. The MCMC first runs with `nchains` chains
and `niter` iterations. If it fails to converge, it increases
the iterations by a factor of `increasefac` and tries again,
and repeats until it converges or has tried `increasetries`
times to increase the number of iterations. If the effective
sample size exceeds 3 times `neff_min` then we allow
R to be `1 + 1.5 (r_max - 1)`.
Returns:
The tuple `(converged, pi_means, pi_95credint, logstring)` where:
- `converged` is `True` if the MCMC converged, `False` otherwise.
- `pi_means` is dict keyed by characters in `charlist`
with the value giving the :math:`\pi_{r,a}`.
- `pi_95credint` is dict keyed by characters, values are 2-tuples
giving median-centered credible interval for :math:`\pi_{r,a}`.
- `logstring` is a string describing MCMC run and convergence.
"""
logstring = ['\tBeginning MCMC at %s' % time.asctime()]
random.seed(seed)
numpy.random.seed(seed)
assert nchains >= 2, "nchains must be at least two"
assert niter >= 100, "niter must be at least 100"
assert len(charlist) == len(set(charlist))
assert wtchar in charlist
data = {'Nchar':len(charlist),
'iwtchar':charlist.index(wtchar) + 1,
'nrpre':[counts['pre'][c] for c in charlist],
'nrpost':[counts['post'][c] for c in charlist],
'pir_prior_params':[max(PRIOR_MIN_VALUE,
priors['pir_prior_params'][c]) for c in charlist],
'mur_prior_params':[max(PRIOR_MIN_VALUE,
priors['mur_prior_params'][c]) for c in charlist],
}
if error_model == 'none':
sm = StanModelNoneErr().model
elif isinstance(error_model, StanModelNoneErr):
sm = error_model.model
error_model = 'none'
elif error_model == 'same' or isinstance(error_model, StanModelSameErr):
if error_model == 'same':
sm = StanModelSameErr().model
else:
sm = error_model.model
error_model = 'same'
data['nrerr'] = [counts['err'][c] for c in charlist]
data['epsilonr_prior_params'] = [max(PRIOR_MIN_VALUE,
priors['epsilonr_prior_params'][c]) for c in charlist]
elif (error_model == 'different' or
isinstance(error_model, StanModelDifferentErr)):
if error_model == 'different':
sm = StanModelDifferentErr().model
else:
sm = error_model.model
error_model = 'different'
data['nrerrpre'] = [counts['errpre'][c] for c in charlist]
data['nrerrpost'] = [counts['errpost'][c] for c in charlist]
data['epsilonr_prior_params'] = [max(PRIOR_MIN_VALUE,
priors['epsilonr_prior_params'][c]) for c in charlist]
data['rhor_prior_params'] = [max(PRIOR_MIN_VALUE,
priors['rhor_prior_params'][c]) for c in charlist]
else:
raise ValueError("Invalid error_model {0}".format(error_model))
ntry = 0
while True: # run until converged or tries exhausted
init = _initialValuePrefs(error_model, nchains,
charlist.index(wtchar), len(charlist))
fit = sm.sampling(data=data, iter=niter, chains=nchains,
seed=seed, n_jobs=n_jobs, refresh=-1, init=init)
# extract output
fitsummary = fit.summary()
rownames = list(fitsummary['summary_rownames'])
colnames = list(fitsummary['summary_colnames'])
summary = fitsummary['summary']
char_row_indices = dict([(c, rownames.index('pir[{0}]'.format(
charlist.index(c) + 1))) for c in charlist])
rindex = colnames.index('Rhat')
neffindex = colnames.index('n_eff')
rlist = [summary[char_row_indices[c]][rindex] for c in charlist]
rhat_is_nan = [rhat for rhat in rlist if math.isnan(rhat)]
rlist = [rhat for rhat in rlist if not math.isnan(rhat)]
nefflist = [summary[char_row_indices[c]][neffindex] for c in charlist]
neffmean = sum(nefflist) / float(len(nefflist))
if not rlist:
assert len(rhat_is_nan) == len(charlist)
rmean = None
logstring.append('\tAfter {0} MCMC chains each of {1} steps, '
'mean R = nan and mean Neff = {2}'.format(
nchains, niter, neffmean))
else:
rmean = sum(rlist) / float(len(rlist))
logstring.append('\tAfter {0} MCMC chains each of {1} steps, '
'mean R = {2} and mean Neff = {3}'.format(
nchains, niter, rmean, neffmean))
if rhat_is_nan:
logstring.append('\t\tThere are {0} characters where R is nan'
.format(len(rhat_is_nan)))
# allow convergence with stringent criteria when Rhat is nan
# pystan appears to give Rhat of nan for sites with low preference
# pystan Rhat values of nan are a bug according to pystan developers
if ((len(rhat_is_nan) < 0.25 * len(charlist) and
rmean != None and rmean <= r_max and neffmean >= neff_min)
or (neffmean >= 3.0 * neff_min and ((rmean == None)
or (rmean != None and rmean <= 1.0 + 1.5 * (r_max - 1.0))))):
# converged
logstring.append('\tMCMC converged at {0}.'.format(time.asctime()))
meanindex = colnames.index('mean')
lower95index = colnames.index('2.5%')
upper95index = colnames.index('97.5%')
pi_means = dict([(c, summary[char_row_indices[c]][meanindex])
for c in charlist])
pi_95credint = dict([(c,
(summary[char_row_indices[c]][lower95index],
summary[char_row_indices[c]][upper95index]))
for c in charlist])
return (True, pi_means, pi_95credint, '\n'.join(logstring))
else:
# failed to converge
if ntry < increasetries:
ntry += 1
niter = int(niter * increasefac)
logstring.append("\tMCMC failed to converge. Doing retry "
"{0} with {1} iterations per chain.".format(
ntry, niter))
else:
with open('_no_converge_prefs_debug.pickle', 'wb') as f_debug:
pickle.dump((counts, init, fitsummary), f_debug)
logstring.append("\tMCMC FAILED to converge after "
"all attempts at {0}.".format(time.asctime()))
meanindex = colnames.index('mean')
lower95index = colnames.index('2.5%')
upper95index = colnames.index('97.5%')
pi_means = dict([(c, summary[char_row_indices[c]][meanindex])
for c in charlist])
pi_95credint = dict([(c,
(summary[char_row_indices[c]][lower95index],
summary[char_row_indices[c]][upper95index]))
for c in charlist])
return (False, pi_means, pi_95credint, '\n'.join(logstring))
[docs]def prefsToMutFromWtEffects(prefs, charlist, wts):
"""Converts preferences effects of mutations away from wildtype.
This function is similar to :func:`prefsToMutEffects` but
computes the effects of mutations away from a given wildtype
sequence.
Args:
`prefs`
Same meaning as for :func:`prefsToMutEffects`.
`charlist`
Same meaning as for :func:`prefsToMutEffects`.
`wts` (pandas DataFrame)
Has columns named "site" and "wildtype" giving the
wildtype amino acid for each site. The sites must
match those in `prefs`.
Returns:
A pandas DataFrame similar to that returned by
:func:`prefsToMutEffects` except that it only
contains mutations away from the wildtype character
at each site, and the columns specifying the wildtype
and mutant characters are named "wildtype" and "mutant"
rather than "initial" and "final".
>>> charlist = ['A', 'C', 'G']
>>> prefs = pandas.DataFrame({
... 'site':[1, 2],
... 'A':[0.25, 0.25],
... 'C':[0.25, 0.5],
... 'G':[0.5, 0.25],
... })
>>> wts = pandas.DataFrame({
... 'site':[1, 2],
... 'wildtype':['G', 'C']
... })
>>> prefsToMutFromWtEffects(prefs, charlist, wts)
site wildtype mutant mutation effect log2effect
0 1 G A G1A 0.5 -1.0
1 1 G C G1C 0.5 -1.0
2 1 G G G1G 1.0 0.0
3 2 C A C2A 0.5 -1.0
4 2 C C C2C 1.0 0.0
5 2 C G C2G 0.5 -1.0
"""
muteffects = prefsToMutEffects(prefs, charlist)
if not ({'wildtype', 'site'} <= set(wts.columns)):
raise ValueError("`wts` doesn't have site and wildtype cols")
if not (set(wts.wildtype) <= set(charlist)):
raise ValueError("wildtype not all in `charlist`")
if (prefs.site.sort_values() != wts.site.sort_values()).any():
raise ValueError("`prefs` and `wts` do not have same sites")
assert 'site' in muteffects.columns
assert 'wildtype' not in muteffects.columns
muteffects_from_wt = (
muteffects
.merge(wts[['site', 'wildtype']], on='site')
.query('initial == wildtype')
.drop(columns=['initial'])
.rename(columns={'final':'mutant'})
[['site', 'wildtype', 'mutant', 'mutation',
'effect', 'log2effect']]
.sort_values(['site', 'mutant'])
.reset_index(drop=True)
)
return muteffects_from_wt
[docs]def prefsToMutEffects(prefs, charlist):
r"""Converts amino acid preferences to effects of specific mutations.
If the preference of site :math:`r` for amino acid :math:`a` is
:math:`\pi_{r,a}`, then the estimated effect (e.g., ratio of
enrichment ratios) for mutating that site from :math:`x` to
:math:`y` is :math:`\frac{\pi_{r,y}}{\pi_{r,x}}`. Very small
values indicate disfavored mutations, and very large values
indicate highly favored mutations. The logarithm base 2 of the
expected effects is also a useful measure -- negative values
are disfavored mutations, positive values are favored ones.
Args:
`prefs` (pandas DataFrame)
Preferences to analyze. The columns should be `site` and
a column for every character in `charlist`.
`charlist` (list)
The list of characters that we are analyzing. For instance,
`dms_tools2.AAS` for amino acids.
Returns:
A pandas Data Frame where the columns are:
* `site`: the site
* `initial`: the initial character (e.g., amino acid)
* `final`: the final character after the mutation
* `mutation`: mutation in the string form `A1G`
* `effect`: the effect of the mutation
* `log2effect`: the log2 of the effect of the mutation.
>>> charlist = ['A', 'C', 'G']
>>> prefs = pandas.DataFrame({
... 'site':[1, 2],
... 'A':[0.25, 0.25],
... 'C':[0.25, 0.5],
... 'G':[0.5, 0.25],
... })
>>> effects = prefsToMutEffects(prefs, charlist)
>>> set(effects.columns) == {'site', 'initial', 'final',
... 'mutation', 'effect', 'log2effect'}
True
>>> numpy.allclose(effects[effects['initial'] == 'A']['effect'],
... [1, 1, 2, 1, 2, 1])
True
>>> numpy.allclose(effects[effects['initial'] == 'C']['effect'],
... [1, 1, 2, 0.5, 1, 0.5])
True
>>> numpy.allclose(effects['effect'], 2**effects['log2effect'])
True
"""
if not (set(prefs.columns) <= set(['site'] + charlist)):
raise ValueError("`prefs` contain extra columns")
initial = prefs.melt(id_vars='site', value_vars=charlist,
var_name='initial', value_name='initial_pref')
final = initial.rename(columns=
{'initial':'final', 'initial_pref':'final_pref'})
effects = final.merge(initial, on='site')
effects['effect'] = effects['final_pref'] / effects['initial_pref']
effects['log2effect'] = numpy.log2(effects['effect'])
effects['mutation'] = (effects['initial'] +
effects['site'].map(str) + effects['final'])
return (effects
.drop(['final_pref', 'initial_pref'], axis=1)
.sort_values(['site', 'initial', 'final'])
[['site', 'initial', 'final', 'mutation', 'effect', 'log2effect']]
.reset_index(drop=True)
)
[docs]def rescalePrefs(prefs, stringency):
r"""Re-scale amino acid preferences by stringency parameter.
If the initial preference of site :math:`r` for amino-acid
:math:`a` is :math:`\pi_{r,a}`, then the re-scaled preference is
.. math::
\frac{\left(\pi_{r,a}\right)^{\beta}}{\sum_{a'} \left(\pi_{r,a'}\right)}
where :math:`\beta` is the stringency parameter.
Args:
`prefs` (pandas.DataFrame)
Columns are 'site' and then every character (e.g.,
amino acid).
`stringency` (float >= 0)
The stringency parameter.
Returns:
A data frame in the same format as `prefs` but with the
re-scaled preferences.
>>> prefs = pandas.DataFrame(dict(
... [('site', [1, 2]), ('A', [0.24, 0.03]), ('C', [0.04, 0.43])]
... + [(aa, [0.04, 0.03]) for aa in dms_tools2.AAS if
... aa not in ['A', 'C']]))
>>> numpy.allclose(1, prefs.drop('site', axis=1).sum(axis=1))
True
>>> rescaled = rescalePrefs(prefs, 1.0)
>>> all([numpy.allclose(prefs[c], rescaled[c]) for c in prefs.columns])
True
>>> rescaled2 = rescalePrefs(prefs, 2.0)
>>> all(rescaled2['site'] == prefs['site'])
True
>>> numpy.allclose(rescaled2['A'], [0.6545, 0.0045], atol=1e-3)
True
>>> numpy.allclose(rescaled2['C'], [0.0182, 0.9153], atol=1e-3)
True
"""
assert set(prefs.drop('site', axis=1).select_dtypes(
include=[numpy.number]).columns) == set(
prefs.drop('site', axis=1).columns), ('Non-numeric '
'columns other than "site"')
chars = prefs.drop('site', axis=1).columns
rescaled = prefs[chars].pow(stringency)
rescaled = rescaled.divide(rescaled.sum(axis=1), axis='index')
rescaled['site'] = prefs['site']
return rescaled[prefs.columns]
[docs]def avgPrefs(prefsfiles):
"""Gets average of site-specific preferences.
Args:
`prefsfiles` (list)
List of CSV files containing preferences, must all be
for same sites and characters.
Returns:
A `pandas.DataFrame` containing the average of the
preferences in `prefsfiles`. In this returned
data frame, `site` is the index
>>> tf1 = tempfile.NamedTemporaryFile
>>> tf2 = tempfile.NamedTemporaryFile
>>> with tf1(mode='w') as file1, tf2(mode='w') as file2:
... x = file1.write('site,A,C,G,T\\n'
... '10,0.2,0.2,0.5,0.1\\n'
... '2a,0.3,0.3,0.3,0.1')
... file1.flush()
... x = file2.write('site,A,C,G,T\\n'
... '10,0.4,0.1,0.1,0.4\\n'
... '2a,0.3,0.4,0.1,0.2')
... file2.flush()
... avg = avgPrefs([file1.name, file2.name])
>>> (avg['site'] == ['2a', '10']).all()
True
>>> numpy.allclose(avg['A'], [0.3, 0.3])
True
>>> numpy.allclose(avg['C'], [0.35, 0.15])
True
>>> numpy.allclose(avg['G'], [0.2, 0.3])
True
>>> numpy.allclose(avg['T'], [0.15, 0.25])
True
"""
assert len(prefsfiles) >= 1
prefs = [pandas.read_csv(f, index_col='site').sort_index()
for f in prefsfiles]
# make sure all have the same columns in the same order
cols = prefs[0].columns
for i in range(len(prefs)):
assert set(cols) == set(prefs[i].columns)
prefs[i] = prefs[i][cols]
avgprefs = pandas.concat(prefs).groupby('site').mean().reset_index()
# natural sort by site: https://stackoverflow.com/a/29582718
avgprefs = avgprefs.reindex(index=natsort.order_by_index(avgprefs.index,
natsort.index_realsorted(avgprefs.site)))
return avgprefs
[docs]def prefsEntropy(prefs, charlist):
r"""Calculate site entropy and number of effective characters.
The site entropy :math:`h_r` for site :math:`r` is defined as
:math:`h_r = -\sum_{x} \pi_{r,x} \log\left(\pi_{r,x}\right)`
where :math:`\pi_{r,x}` is the preference for character (e.g.,
amino acid) :math:`x` at site :math:`r`, and the log is
the natural logarithm.
The number of effective characters at site :math:`r`
is :math:`N_{\rm{eff}, r} = \exp\left(h_r\right)`.
Args:
`prefs` (pandas DataFrame)
Data frame with the preferences. Should have a column
named `site` and a column for each character in
`charlist`.
`charlist` (list)
List of the characters of interest, for example
the amino acids.
Returns:
A copy of `prefs` that has additional columns named
`entropy` and `neffective`, giving the site entropy and
number of effective characters for each site. For
the entropies, log is taken to base :math:`e`.
>>> charlist = ['A', 'C', 'G', 'T']
>>> prefs = pandas.DataFrame({
... 'site':[1, 2, 3],
... 'A':[0.25, 0.6, 1 / 3.],
... 'C':[0.25, 0.3, 1 / 3.],
... 'G':[0.25, 0.1, 1 / 3.],
... 'T':[0.25, 0.0, 0.0],
... })
>>> prefs_entropy = prefsEntropy(prefs, charlist)
>>> (set(['entropy', 'neffective', 'site'] + charlist) ==
... set(prefs_entropy.columns))
True
>>> h2 = -0.6 * math.log(0.6) - 0.3 * math.log(0.3) - 0.1 * math.log(0.1)
>>> numpy.allclose(prefs_entropy['entropy'], [math.log(4), h2, math.log(3)])
True
>>> numpy.allclose(prefs_entropy['neffective'], [4, math.exp(h2), 3])
True
"""
assert 'site' in prefs.columns, "prefs does not have `site` column"
assert set(charlist) < set(prefs.columns), "cols do not contain charlist"
assert charlist, "no characters specified"
assert numpy.allclose(prefs[charlist].sum(axis=1), 1, atol=1e-4),\
"prefs do not sum to one"
prefs_entropy = prefs.copy()
prefs_entropy['entropy'] = 0
for char in charlist:
p_char = prefs_entropy[char].where(prefs_entropy[char] > 0, 1)
prefs_entropy['entropy'] -= numpy.log(p_char) * p_char
prefs_entropy['neffective'] = numpy.exp(prefs_entropy['entropy'])
return prefs_entropy
[docs]def aafreqsFromAlignment(alignmentfile, codon_to_aa,
ignore_gaps=True, ignore_stop=True):
"""Get amino-acid frequencies at each site in alignment.
Args:
`alignmentfile` (str)
FASTA file with alignment of proteins or coding sequences.
`codon_to_aa` (bool)
If `True`, translate codon alignment to amino acids.
`ignore_gaps` (bool)
Ignore gaps when calculating frequencies.
`ignore_stop` (bool)
Ignore stop codons when calculating frequencies.
Returns:
A `pandas.DataFrame` with columns being `site` (1, 2, ...
numbering) and other columns being amino acids and values
giving frequencies in alignment.
>>> with tempfile.NamedTemporaryFile(mode='w') as f:
... x = f.write('>seq1\\n'
... 'ATGGGGCAG\\n'
... '>seq2\\n'
... '---AGGCAG\\n'
... '>seq3\\n'
... 'ATGTGACAG')
... f.flush()
... aafreqs = aafreqsFromAlignment(f.name, codon_to_aa=True)
>>> aas_counts = ['M', 'G', 'R', 'Q']
>>> aas_nocounts = [a for a in dms_tools2.AAS if a not in aas_counts]
>>> (0 == aafreqs[aas_nocounts].values).all()
True
>>> expected_counts = pandas.DataFrame.from_dict(
... collections.OrderedDict([
... ('site', [1, 2, 3]), ('M', [1.0, 0.0, 0.0]),
... ('G', [0.0, 0.5, 0]), ('R', [0.0, 0.5, 0.0]),
... ('Q', [0.0, 0.0, 1.0])]))
>>> expected_counts.equals(aafreqs[['site'] + aas_counts])
True
"""
# read sequences
seqs = [s.seq for s in Bio.SeqIO.parse(alignmentfile, 'fasta')]
if codon_to_aa:
seqs = [s.translate(gap='-', stop_symbol='*') for s in seqs]
assert seqs, "No sequences"
seqlen = len(seqs[0])
assert seqlen, "sequences have no length"
assert all([seqlen == len(s) for s in seqs]), "seqs not same length"
# get character sets
aas = dms_tools2.AAS.copy()
skipchars = []
if ignore_gaps:
skipchars.append('-')
else:
aas.append('-')
if ignore_stop:
skipchars.append('*')
else:
aas.append('*')
# tally amino-acid frequencies
aafreqs = dict([(col, [0] * seqlen) for col in aas])
aafreqs['site'] = list(range(1, seqlen + 1))
for s in seqs:
for (r, aa) in enumerate(s):
if aa in skipchars:
continue
else:
aafreqs[aa][r] += 1
# convert to dataframe and change counts to freqs
aafreqs = pandas.DataFrame(aafreqs)
ncounts = aafreqs[aas].sum(axis=1).astype('float')
for aa in aas:
aafreqs[aa] = aafreqs[aa] / ncounts
return aafreqs[['site'] + aas].fillna(0)
if __name__ == '__main__':
import doctest
doctest.testmod()