Source code for dms_tools2.syn_selection

"""
==============
syn_selection
==============

Identifies enrichment of synonymous codons.

"""

import pandas as pd
import numpy as np
import scipy.stats

from dms_tools2 import CODON_TO_AA

[docs]def syn_selection_by_codon(counts_pre, counts_post, pseudocount=0.5): """Identify sites with selection on synonymous codons. Runs two-tailed Fisher Exact test, which returns: - P-value reflecting the significance of codon_x enrichment. After calculating the Fisher P-value, a pseudocount is added for calculation of the odds ratio, which reflects the enrichment of codon_x after selection (relative to other synonymous codons). - The pseudocount removes 0's to avoid returning NA or inf odds ratios. - Rows where only one codon is represented pre-selection are dropped. Args: `counts_pre` (str or pandas.DataFrame) CSV file giving pre-selection codon counts with columns named 'site', 'wildtype', and list of codons. Can also be a pandas DataFrame containing the CSV file. `counts_post` (str or pandas.DataFrame) Like `counts_pre` but for the post-selection counts. CSV file giving post-selection codon counts in same format as `counts_pre`. 'pseudocount' (float or int, default 0.5) Number to add to each codon count before calculating the odds ratio. Returns: A pandas DataFrame with the following columns: - 'site' - 'wildtype' : wildtype codon at site - 'codon' : codon we are analyzing at site - 'aa' : amino acid - 'codon_pre' : counts for codon of interest pre-selection - 'aa_pre' : counts for all codons for amino acid pre-selection - 'codon_post' : counts for codon of interest post-selection - 'aa_post' : counts for all codons for amino acid post-selection - 'odds_ratio' : enrichment of codon post-selection - 'P' : P-value calculated using Fisher's exact test Example: >>> pd.set_option('display.max_columns', None) # display all columns >>> pd.set_option('expand_frame_repr', False) # do not break lines >>> counts_pre = pd.DataFrame.from_records( ... [(1, 'ATC', 5, 100, 10), ... (2, 'ATT', 50, 10, 10), ... ], ... columns=['site', 'wildtype', 'ATT', 'ATC', 'ATA'], ... ) >>> counts_post = pd.DataFrame.from_records( ... [(1, 'ATC', 5, 50, 75), ... (2, 'ATT', 50, 9, 11), ... ], ... columns=['site', 'wildtype', 'ATT', 'ATC', 'ATA'], ... ) >>> syn_selection_by_codon(counts_pre, counts_post, 1) site wildtype codon aa codon_pre aa_pre codon_post aa_post odds_ratio P 0 1 ATC ATT I 6 118 6 133 0.881890 1.000000e+00 1 1 ATC ATC I 101 118 51 133 0.104685 1.798192e-15 2 1 ATC ATA I 11 118 76 133 12.969697 7.500709e-17 3 2 ATT ATT I 51 73 51 73 1.000000 1.000000e+00 4 2 ATT ATC I 11 73 10 73 0.894661 1.000000e+00 5 2 ATT ATA I 11 73 12 73 1.108793 1.000000e+00 """ # translate codons and calculate aa_count for pre- and post-selection datasets counts_df_list = [] for df, df_type in [(counts_pre, 'pre'), (counts_post, 'post')]: df = ( df.melt(id_vars=['site', 'wildtype'], var_name='codon', value_name=f"codon_{df_type}") .assign(aa=lambda x: x['codon'].map(CODON_TO_AA)) # translate codons [['site', 'wildtype', 'codon', 'aa', f"codon_{df_type}"]] ) # group synonymous codons at each site aaGroups = df.groupby(['site', 'aa']) df = ( df.assign(aa_count=aaGroups[f"codon_{df_type}"].transform(np.sum)) # use in calculating significance of codon vs other synonymous codons .assign(syn_codons=lambda x: x['aa_count'] - x[f"codon_{df_type}"]) .rename(columns={'aa_count': f"aa_{df_type}", 'syn_codons': f"syn_codons_{df_type}"}) ) counts_df_list.append(df) # merge counts_pre and counts_post df_merge = pd.merge(counts_df_list[0], counts_df_list[1], on=['site', 'wildtype', 'codon', 'aa']) # apply Fisher's exact test to codon x vs other synonymous codons df_merge['fishers'] = ( df_merge.apply(lambda x: scipy.stats.fisher_exact( [[x.syn_codons_pre, x.syn_codons_post], [x.codon_pre, x.codon_post]]), axis=1) ) # split fishers results into 'odds_ratio' and 'P', and drop 'odds_ratio' new_col_list = ['odds_ratio', 'P'] for n, col in enumerate(new_col_list): df_merge[col] = df_merge['fishers'].map(lambda x: x[n]) # drop columns for recalculation with pseudocount df_merge = df_merge.drop(['odds_ratio', 'fishers', 'aa_pre', 'syn_codons_pre', 'aa_post', 'syn_codons_post'], axis=1) for df_type in ['pre', 'post']: # add pseudocount df_merge[f'codon_{df_type}'] += pseudocount # recalculate aa and syn_codon sums with pseudocount aaGroups = df_merge.groupby(['site', 'aa']) df_merge = ( df_merge.assign(aa_count=aaGroups[f"codon_{df_type}"].transform(np.sum)) # use in calculating significance of codon vs other synonymous codons .assign(syn_codons=lambda x: x['aa_count'] - x[f"codon_{df_type}"]) .rename(columns={'aa_count': f"aa_{df_type}", 'syn_codons': f"syn_codons_{df_type}"}) ) # drop rows where syn_codons_pre = 0 df_merge = df_merge[df_merge.syn_codons_pre != 0] # calculate odds ratio with pseudocount df_merge['odds_ratio'] = ( df_merge.apply(lambda x: ((x.codon_post / x.syn_codons_post) / (x.codon_pre / x.syn_codons_pre)), axis=1) ) # drop extra columns df = (df_merge .drop(['syn_codons_pre', 'syn_codons_post'], axis=1) .sort_values(['site', 'aa']) .reset_index(drop=True) [['site', 'wildtype', 'codon', 'aa', 'codon_pre', 'aa_pre', 'codon_post', 'aa_post', 'odds_ratio', 'P']] ) return df
if __name__ == '__main__': import doctest doctest.testmod()