Source code for dms_tools2.syn_selection

"""
==============
syn_selection
==============

Identifies enrichment of synonymous codons.

"""

import pandas as pd
import numpy as np
import scipy.stats

from dms_tools2 import CODON_TO_AA

[docs]def syn_selection_by_codon(counts_pre, counts_post, pseudocount=0.5):
    """Identify sites with selection on synonymous codons.

    Runs two-tailed Fisher Exact test, which returns:
     - P-value reflecting the significance of codon_x enrichment.

    After calculating the Fisher P-value, a pseudocount is added
    for calculation of the odds ratio, which reflects the enrichment of
    codon_x after selection (relative to other synonymous codons).
     - The pseudocount removes 0's to avoid returning NA or inf odds ratios.
     - Rows where only one codon is represented pre-selection are dropped.

    Args:
        `counts_pre` (str or pandas.DataFrame)
            CSV file giving pre-selection codon counts with columns
            named 'site', 'wildtype', and list of codons. Can also
            be a pandas DataFrame containing the CSV file.
        `counts_post` (str or pandas.DataFrame)
            Like `counts_pre` but for the post-selection counts.
            CSV file giving post-selection codon counts in same format
            as `counts_pre`.
        'pseudocount' (float or int, default 0.5)
            Number to add to each codon count before calculating the odds
            ratio.

    Returns:
        A pandas DataFrame with the following columns:
         - 'site'
         - 'wildtype' : wildtype codon at site
         - 'codon' : codon we are analyzing at site
         - 'aa' : amino acid
         - 'codon_pre' : counts for codon of interest pre-selection
         - 'aa_pre' : counts for all codons for amino acid pre-selection
         - 'codon_post' : counts for codon of interest post-selection
         - 'aa_post' : counts for all codons for amino acid post-selection
         - 'odds_ratio' : enrichment of codon post-selection
         - 'P' : P-value calculated using Fisher's exact test

    Example:

    >>> pd.set_option('display.max_columns', None)  # display all columns
    >>> pd.set_option('expand_frame_repr', False)  # do not break lines
    >>> counts_pre = pd.DataFrame.from_records(
    ...         [(1, 'ATC', 5, 100, 10),
    ...          (2, 'ATT', 50, 10, 10),
    ...          ],
    ...         columns=['site', 'wildtype', 'ATT', 'ATC', 'ATA'],
    ...         )
    >>> counts_post = pd.DataFrame.from_records(
    ...         [(1, 'ATC', 5, 50, 75),
    ...          (2, 'ATT', 50, 9, 11),
    ...          ],
    ...         columns=['site', 'wildtype', 'ATT', 'ATC', 'ATA'],
    ...         )
    >>> syn_selection_by_codon(counts_pre, counts_post, 1)
       site wildtype codon aa  codon_pre  aa_pre  codon_post  aa_post  odds_ratio             P
    0     1      ATC   ATT  I          6     118           6      133    0.881890  1.000000e+00
    1     1      ATC   ATC  I        101     118          51      133    0.104685  1.798192e-15
    2     1      ATC   ATA  I         11     118          76      133   12.969697  7.500709e-17
    3     2      ATT   ATT  I         51      73          51       73    1.000000  1.000000e+00
    4     2      ATT   ATC  I         11      73          10       73    0.894661  1.000000e+00
    5     2      ATT   ATA  I         11      73          12       73    1.108793  1.000000e+00

    """

    # translate codons and calculate aa_count for pre- and post-selection datasets
    counts_df_list = []
    for df, df_type in [(counts_pre, 'pre'), (counts_post, 'post')]:
        df = (
         df.melt(id_vars=['site', 'wildtype'],
                 var_name='codon',
                 value_name=f"codon_{df_type}")
         .assign(aa=lambda x: x['codon'].map(CODON_TO_AA))  # translate codons
         [['site', 'wildtype', 'codon', 'aa', f"codon_{df_type}"]]
         )

        # group synonymous codons at each site
        aaGroups = df.groupby(['site', 'aa'])

        df = (
         df.assign(aa_count=aaGroups[f"codon_{df_type}"].transform(np.sum))
         # use in calculating significance of codon vs other synonymous codons
         .assign(syn_codons=lambda x: x['aa_count'] - x[f"codon_{df_type}"])
         .rename(columns={'aa_count': f"aa_{df_type}",
                          'syn_codons': f"syn_codons_{df_type}"})
         )

        counts_df_list.append(df)

    # merge counts_pre and counts_post
    df_merge = pd.merge(counts_df_list[0],
                        counts_df_list[1],
                        on=['site', 'wildtype', 'codon', 'aa'])

    # apply Fisher's exact test to codon x vs other synonymous codons
    df_merge['fishers'] = (
        df_merge.apply(lambda x: scipy.stats.fisher_exact(
            [[x.syn_codons_pre, x.syn_codons_post],
             [x.codon_pre, x.codon_post]]), axis=1)
    )

    # split fishers results into 'odds_ratio' and 'P', and drop 'odds_ratio'
    new_col_list = ['odds_ratio', 'P']
    for n, col in enumerate(new_col_list):
        df_merge[col] = df_merge['fishers'].map(lambda x: x[n])
    # drop columns for recalculation with pseudocount
    df_merge = df_merge.drop(['odds_ratio', 'fishers', 'aa_pre', 'syn_codons_pre',
                              'aa_post', 'syn_codons_post'], axis=1)

    for df_type in ['pre', 'post']:
        # add pseudocount
        df_merge[f'codon_{df_type}'] += pseudocount
        # recalculate aa and syn_codon sums with pseudocount
        aaGroups = df_merge.groupby(['site', 'aa'])
        df_merge = (
            df_merge.assign(aa_count=aaGroups[f"codon_{df_type}"].transform(np.sum))
                # use in calculating significance of codon vs other synonymous codons
                .assign(syn_codons=lambda x: x['aa_count'] - x[f"codon_{df_type}"])
                .rename(columns={'aa_count': f"aa_{df_type}",
                                 'syn_codons': f"syn_codons_{df_type}"})
        )

    # drop rows where syn_codons_pre = 0
    df_merge = df_merge[df_merge.syn_codons_pre != 0]

    # calculate odds ratio with pseudocount
    df_merge['odds_ratio'] = (
        df_merge.apply(lambda x:
                       ((x.codon_post / x.syn_codons_post) /
                        (x.codon_pre / x.syn_codons_pre)), axis=1)
    )

    # drop extra columns
    df = (df_merge
        .drop(['syn_codons_pre', 'syn_codons_post'], axis=1)
        .sort_values(['site', 'aa'])
        .reset_index(drop=True)
    [['site', 'wildtype', 'codon', 'aa', 'codon_pre', 'aa_pre', 'codon_post',
      'aa_post', 'odds_ratio', 'P']]
        )

    return df


if __name__ == '__main__':
    import doctest
    doctest.testmod()
Source code for dms_tools2.syn_selection

dms_tools2

Navigation

Related Topics