Source code for dms_tools2.codonvarianttable

"""
=================
codonvarianttable
=================

This module defines :class:`CodonVariantTable` objects
for storing and handling codon variants of a gene. They are
useful when you are performing deep mutational scanning
on barcoded codon-mutant libraries of a gene.
"""

import re
import os
import math
import collections
import itertools
import tempfile
import random

import scipy
import pandas as pd
import Bio.SeqUtils.ProtParamData
import gpmap
import numpy as np

# use plotnine for plotting
from plotnine import *

from dms_tools2.plot import latexSciNot
from dms_tools2 import (CODON_TO_AA,
                        CODONS,
                        AAS_WITHSTOP,
                        AA_TO_CODONS,
                        NTS
                        )

#: `color-blind safe palette <http://bconnelly.net/2013/10/creating-colorblind-friendly-figures/>`_
CBPALETTE = ["#999999", "#E69F00", "#56B4E9", "#009E73",
             "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]


[docs]class CodonVariantTable: """Associates barcodes with codon mutants of gene. Args: `barcode_variant_file` (str) CSV file giving barcodes and variants. Must have columns named "library", "barcode", "substitutions", (nucleotide mutations in 1, ... numbering in a format like "G301A A302T G856C"), and "variant_call_support" (sequences supporting barcode-variant call). Any additional columns are removed unless they are specified in `extra_cols`. `geneseq` (str) Sequence of wildtype protein-coding gene. `substitutions_are_codon` (bool) If `True`, then the "substitutions" column in `barcode_variant_file` gives the substitutions as codon rather than nucleotide mutations (e.g., "ATG1ATA GTA5CCC" for substitutions at codons 1 and 5 in 1, 2, ... numbering). `extra_cols` (list) Additional columns in `barcode_variant_file` to retain when creating `barcode_variant_df` and `variant_count_df` attributes. Attributes: `geneseq` (str) `geneseq` passed at initialization. `sites` (list) List of all codon sites in 1, 2, ... numbering. `codons` (OrderedDict) `codons[r]` is wildtype codon at site `r`, ordered by sites. `aas` (OrderedDict) `aas[r]` is wildtype amino acid at site `r`, ordered by sites. `libraries` (list) List of libraries in `barcode_variantfile`. `barcode_variant_df` (pandas DataFrame) Info about codon mutations parsed from `barcode_variantfile`. `variant_count_df` (pandas DataFrame or None) Initially `None`, but after data are added with :class:`CodonVariantTable.addSampleCounts`, holds counts of each variant for each sample. Differs from `barcode_variant_df` in that the former just holds initial barcode-variant data, whereas `variant_count_df` is updated with variant counts for samples. Here is an example. First, initialize a :class:`CodonVariantTable`: >>> geneseq = 'ATGGGATGA' >>> variantfile = '_variantfile.csv' >>> with open(variantfile, 'w') as f: ... _ = f.write( ... 'library,barcode,substitutions,variant_call_support\\n' ... 'lib_1,AAC,,2\\n' ... 'lib_1,GAT,G4C A6C,1\\n' ... 'lib_2,AAC,T2A G4T,2\\n' ... 'lib_2,CAT,A6C,3' ... ) >>> variants = CodonVariantTable( ... barcode_variant_file=variantfile, ... geneseq=geneseq ... ) >>> os.remove(variantfile) Check attributes of the :class:`CodonVariantTable`: >>> variants.sites [1, 2, 3] >>> variants.codons == {1:'ATG', 2:'GGA', 3:'TGA'} True >>> variants.aas == {1:'M', 2:'G', 3:'*'} True >>> variants.libraries ['lib_1', 'lib_2'] >>> variants.valid_barcodes('lib_1') == {'AAC', 'GAT'} True >>> variants.valid_barcodes('lib_2') == {'AAC', 'CAT'} True >>> pd.set_option('display.max_columns', 15) >>> pd.set_option('display.width', 500) >>> variants.barcode_variant_df library barcode variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions 0 lib_1 AAC 2 0 0 1 lib_1 GAT 1 GGA2CGC G2R 1 1 2 lib_2 AAC 2 ATG1AAG GGA2TGA M1K G2* 2 2 3 lib_2 CAT 3 GGA2GGC 1 0 We can also look at the number of variants; we get this by calling :class:`CodonVariantTable.n_variants_df` with `samples=None` since we don't have samples, and just want stats across barcoded variants: >>> variants.n_variants_df(samples=None) library sample count 0 lib_1 barcoded variants 2 1 lib_2 barcoded variants 2 2 all libraries barcoded variants 4 Here is the number of variants if we look at just single amino-acid mutants (and wildtype): >>> variants.n_variants_df(samples=None, variant_type='single', mut_type='aa') library sample count 0 lib_1 barcoded variants 2 1 lib_2 barcoded variants 1 2 all libraries barcoded variants 3 We can also see how these numbers change if we require a variant call support of at least 2: >>> variants.n_variants_df(samples=None, min_support=2) library sample count 0 lib_1 barcoded variants 1 1 lib_2 barcoded variants 2 2 all libraries barcoded variants 3 If we want to combine the data for the two libraries, we can use :class:`CodonVariantTable.addMergedLibraries`, which creates a new combined library called "all libraries": >>> variants.addMergedLibraries(variants.barcode_variant_df) library barcode variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions 0 lib_1 AAC 2 0 0 1 lib_1 GAT 1 GGA2CGC G2R 1 1 2 lib_2 AAC 2 ATG1AAG GGA2TGA M1K G2* 2 2 3 lib_2 CAT 3 GGA2GGC 1 0 4 all libraries lib_1-AAC 2 0 0 5 all libraries lib_1-GAT 1 GGA2CGC G2R 1 1 6 all libraries lib_2-AAC 2 ATG1AAG GGA2TGA M1K G2* 2 2 7 all libraries lib_2-CAT 3 GGA2GGC 1 0 Note however that :class:`CodonVariantTable.addMergedLibraries` doesn't do anything if there is only one library: >>> variants.addMergedLibraries(variants.barcode_variant_df ... .query('library == "lib_1"')) library barcode variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions 0 lib_1 AAC 2 0 0 1 lib_1 GAT 1 GGA2CGC G2R 1 1 Count number of barcoded variants with each mutation: >>> variants.mutCounts('all', 'aa', samples=None)[ : 2] library sample mutation count mutation_type site 0 lib_1 barcoded variants G2R 1 nonsynonymous 2 1 lib_1 barcoded variants *3A 0 nonsynonymous 3 We can do the same for codon mutations (here for only a single library), first for all variants: >>> variants.mutCounts('all', 'codon', samples=None, ... libraries=['lib_2'])[ : 4] library sample mutation count mutation_type site 0 lib_2 barcoded variants ATG1AAG 1 nonsynonymous 1 1 lib_2 barcoded variants GGA2GGC 1 synonymous 2 2 lib_2 barcoded variants GGA2TGA 1 stop 2 3 lib_2 barcoded variants ATG1AAA 0 nonsynonymous 1 Then for just single-mutant variants: >>> variants.mutCounts('single', 'codon', samples=None, ... libraries=['lib_2'])[ : 3] library sample mutation count mutation_type site 0 lib_2 barcoded variants GGA2GGC 1 synonymous 2 1 lib_2 barcoded variants ATG1AAA 0 nonsynonymous 1 2 lib_2 barcoded variants ATG1AAC 0 nonsynonymous 1 Initially we haven't added any barcode count information for any samples: >>> all(variants.samples(lib) == [] for lib in variants.libraries) True >>> variants.variant_count_df is None True Now we add barcode count information for sample "input" from library 1 using :class:`CodonVariantTable.addSampleCounts`: >>> counts_lib1_input = pd.DataFrame( ... {'barcode':['AAC', 'GAT'], ... 'count' :[ 253, 1101]}) >>> variants.addSampleCounts('lib_1', 'input', counts_lib1_input) >>> variants.variant_count_df barcode count library sample variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions 0 GAT 1101 lib_1 input 1 GGA2CGC G2R 1 1 1 AAC 253 lib_1 input 2 0 0 We get an error if we try to add these same data again, as they are already added for that sample to that library: >>> variants.addSampleCounts('lib_1', 'input', counts_lib1_input) Traceback (most recent call last): ... ValueError: `library` lib_1 already has `sample` input But, we can add barcode counts for another sample (named "selected" in this case) to library 1: >>> counts_lib1_selected = pd.DataFrame( ... {'barcode':['AAC', 'GAT'], ... 'count' :[ 513, 401]}) >>> variants.addSampleCounts('lib_1', 'selected', counts_lib1_selected) As well as barcode counts for the same two samples ("input" and "selected") to our other library (library 2): >>> counts_lib2_input = pd.DataFrame( ... {'barcode':['AAC', 'CAT'], ... 'count' :[ 1253, 923]}) >>> variants.addSampleCounts('lib_2', 'input', counts_lib2_input) >>> counts_lib2_selected = pd.DataFrame( ... {'barcode':['AAC', 'CAT'], ... 'count' :[ 113, 1200]}) >>> variants.addSampleCounts('lib_2', 'selected', counts_lib2_selected) >>> variants.variant_count_df barcode count library sample variant_call_support codon_substitutions aa_substitutions n_codon_substitutions n_aa_substitutions 0 GAT 1101 lib_1 input 1 GGA2CGC G2R 1 1 1 AAC 253 lib_1 input 2 0 0 2 AAC 513 lib_1 selected 2 0 0 3 GAT 401 lib_1 selected 1 GGA2CGC G2R 1 1 4 AAC 1253 lib_2 input 2 ATG1AAG GGA2TGA M1K G2* 2 2 5 CAT 923 lib_2 input 3 GGA2GGC 1 0 6 CAT 1200 lib_2 selected 3 GGA2GGC 1 0 7 AAC 113 lib_2 selected 2 ATG1AAG GGA2TGA M1K G2* 2 2 We can also use :class:`CodonVariantTable.mutCounts` to look at total counts of each mutation: >>> variants.mutCounts('all', 'aa')[ : 2] library sample mutation count mutation_type site 0 lib_1 input G2R 1101 nonsynonymous 2 1 lib_1 input *3A 0 nonsynonymous 3 >>> variants.mutCounts('all', 'aa', libraries=['lib_2'])[ : 3] library sample mutation count mutation_type site 0 lib_2 input G2* 1253 stop 2 1 lib_2 input M1K 1253 nonsynonymous 1 2 lib_2 input *3A 0 nonsynonymous 3 We can use :class:`CodonVariantTable.writeCodonCounts` to write codon count files. First for only **single** mutants: >>> with tempfile.TemporaryDirectory() as tmpdir: ... countfiles = variants.writeCodonCounts("single", outdir=tmpdir, include_all_libs=True) ... lib1_input = pd.read_csv(f'{tmpdir}/lib_1_input_codoncounts.csv') ... all_sel = pd.read_csv(f'{tmpdir}/all-libraries_selected_codoncounts.csv') Make sure we created expected countfiles: >>> countfiles.assign(countfile=lambda x: x.countfile.apply(os.path.basename)) library sample countfile 0 lib_1 input lib_1_input_codoncounts.csv 1 lib_1 selected lib_1_selected_codoncounts.csv 2 lib_2 input lib_2_input_codoncounts.csv 3 lib_2 selected lib_2_selected_codoncounts.csv 4 all-libraries input all-libraries_input_codoncounts.csv 5 all-libraries selected all-libraries_selected_codoncounts.csv Check for expected values in a few of these counts files, only showing columns with non-zero entries: >>> lib1_input.iloc[:, (lib1_input != 0).any(axis='rows').values] site wildtype ATG CGC GGA TGA 0 1 ATG 253 0 0 0 1 2 GGA 0 1101 253 0 2 3 TGA 0 0 0 253 >>> all_sel.iloc[:, (all_sel != 0).any(axis='rows').values] site wildtype ATG CGC GGA GGC TGA 0 1 ATG 513 0 0 0 0 1 2 GGA 0 401 513 1200 0 2 3 TGA 0 0 0 0 513 Now write codon counts files for **all** mutants: >>> with tempfile.TemporaryDirectory() as tmpdir: ... _ = variants.writeCodonCounts("all", outdir=tmpdir, include_all_libs=True) ... lib1_input_all = pd.read_csv(f'{tmpdir}/lib_1_input_codoncounts.csv') ... all_sel_all = pd.read_csv(f'{tmpdir}/all-libraries_selected_codoncounts.csv') >>> lib1_input_all.iloc[:, (lib1_input_all != 0).any(axis='rows').values] site wildtype ATG CGC GGA TGA 0 1 ATG 1354 0 0 0 1 2 GGA 0 1101 253 0 2 3 TGA 0 0 0 1354 >>> all_sel_all.iloc[:, (all_sel_all != 0).any(axis='rows').values] site wildtype AAG ATG CGC GGA GGC TGA 0 1 ATG 113 2114 0 0 0 0 1 2 GGA 0 0 401 513 1200 113 2 3 TGA 0 0 0 0 0 2227 We can also initialize a :class:`CodonVariantTable` from the `variant_count_df` if we have written that to a CSV file. We do this using :meth:`CodonVariantTable.from_variant_count_df`. The example below shows how this newly initialized variant table is equal to the original one used to write the CSV file: >>> with tempfile.NamedTemporaryFile(mode='w') as f: ... variants.variant_count_df.to_csv(f, index=False) ... f.flush() ... variants_eq = CodonVariantTable.from_variant_count_df( ... variant_count_df_file=f.name, ... geneseq=geneseq) >>> variants == variants_eq True Of course, the initialized variant table is **not** equal to original one if we don't write the full `variant_count_df` to the CSV file: >>> with tempfile.NamedTemporaryFile(mode='w') as f: ... (variants ... .variant_count_df.query('sample == "input"') ... .to_csv(f, index=False) ... ) ... f.flush() ... variants_ne = CodonVariantTable.from_variant_count_df( ... variant_count_df_file=f.name, ... geneseq=geneseq) >>> variants == variants_ne False We can use :meth:`CodonVariantTable.func_scores` to compute the functional effects of mutations. We cannot use this method with default options as we have no wildtype counts (needed for normalization) for `lib_2` so we get an error: >>> variants.func_scores('input') Traceback (most recent call last): ... ValueError: no wildtype counts: library sample count 0 lib_1 input 253 1 lib_1 selected 513 2 lib_2 input 0 3 lib_2 selected 0 However, we can use the method with the `permit_zero_wt` option: >>> variants.func_scores('input', permit_zero_wt=True) library pre_sample post_sample barcode func_score func_score_var pre_count post_count pre_count_wt post_count_wt pseudocount codon_substitutions n_codon_substitutions aa_substitutions n_aa_substitutions 0 lib_1 input selected AAC 0.000000 0.024528 253 513 253 513 0.5 0 0 1 lib_1 input selected GAT -2.474376 0.019337 1101 401 253 513 0.5 GGA2CGC 1 G2R 1 2 lib_2 input selected AAC -3.465198 8.345474 1253 113 0 0 0.5 ATG1AAG GGA2TGA 2 M1K G2* 2 3 lib_2 input selected CAT 0.378452 8.329463 923 1200 0 0 0.5 GGA2GGC 1 0 """ def __eq__(self, other): # following here: https://stackoverflow.com/a/390640 if type(other) is not type(self): return False elif self.__dict__.keys() != other.__dict__.keys(): return False else: for key, val in self.__dict__.items(): val2 = getattr(other, key) if isinstance(val, pd.DataFrame): if not val.equals(val2): return False else: if val != val2: return False return True
[docs] @classmethod def from_simulation(cls, *, geneseq, bclen, library_specs, seed=1, variant_call_support=1): """Simulate :class:`CodonVariantTable` variants. Use this method to simulate the variants in a :class:`CodonVariantTable`. Note that this only simulates the variants, not counts for samples. To add those, then use :func:`simulateSampleCounts` and :meth:`CodonVariantTable.addSampleCounts`. Args: `geneseq` (str) Sequence of wildtype protein-coding gene. `bclen` (int) Length of the barcodes; must enable complexity at least 10-fold greater than max number of variants. `library_specs` (dict) Specifications for each simulated library. Keys are 'avgmuts' and 'nvariants', and values are average-codon mutations per variant and number of variants. Mutations per variant are Poisson distributed. `seed` (int or `None`) Random number seed or `None` to set no seed. `variant_call_support` (int or 2-tuple) If an integer, all variant call supports are set to this. If a 2-tuple, variant call support is drawn as a random integer uniformly from this range (inclusive). Returns: The simulated :class:`CodonVariantTable`. """ if seed is not None: scipy.random.seed(seed) random.seed(seed) if len(library_specs) < 1: raise ValueError('empty `library_specs`') if isinstance(variant_call_support, int): variant_call_support = tuple([variant_call_support] * 2) if len(geneseq) % 3 != 0: raise ValueError('length of `geneseq` not multiple of 3') genelength = len(geneseq) // 3 barcode_variant_dict = collections.defaultdict(list) for lib, specs_dict in library_specs.items(): nvariants = specs_dict['nvariants'] avgmuts = specs_dict['avgmuts'] if 10 * nvariants > (len(NTS))**bclen: # safety factor 10 raise ValueError('barcode too short for nvariants') existing_barcodes = set([]) for ivariant in range(nvariants): barcode = ''.join(random.choices(NTS, k=bclen)) while barcode in existing_barcodes: barcode = ''.join(random.choices(NTS, k=bclen)) existing_barcodes.add(barcode) support = random.randint(*variant_call_support) # get mutations substitutions = [] nmuts = scipy.random.poisson(avgmuts) for icodon in random.sample(range(1, genelength + 1), nmuts): wtcodon = geneseq[3 * (icodon - 1): 3 * icodon] mutcodon = random.choice([c for c in CODONS if c != wtcodon]) for i_nt, (wt_nt, mut_nt) in enumerate(zip(wtcodon, mutcodon)): if wt_nt != mut_nt: igene = 3 * (icodon - 1) + i_nt + 1 substitutions.append(f'{wt_nt}{igene}{mut_nt}') substitutions = ' '.join(substitutions) barcode_variant_dict['barcode'].append(barcode) barcode_variant_dict['substitutions'].append(substitutions) barcode_variant_dict['library'].append(lib) barcode_variant_dict['variant_call_support'].append(support) barcode_variants = pd.DataFrame(barcode_variant_dict) with tempfile.NamedTemporaryFile(mode='w') as f: barcode_variants.to_csv(f, index=False) f.flush() cvt = cls(barcode_variant_file=f.name, geneseq=geneseq) return cvt
[docs] @classmethod def from_variant_count_df(cls, *, variant_count_df_file, geneseq, drop_all_libs=True): """:class:`CodonVariantTable` from CSV of `variant_count_df`. Use this method when you have written a CSV file of the `variant_count_df` attribute of a :class:`CodonVariantTable`, and now wish to re-initialize that :class:`CodonVariantTable`. Args: `variant_count_df_file` (str) Name of CSV file containing the `variant_count_df`. Must have following columns: "barcode", "library", "variant_call_support", "codon_substitutions", "sample", and "count". `geneseq` (str) Sequence of wildtype protein-coding gene. `drop_all_libs` (bool) If there is a library named "all libraries", drop it from the list of libraries in the created :class:`CodonVariantTable` as it probably was added by :meth:`CodonVariantTable.addMergedLibraries` and duplicates information for the individual libraries. Returns: The :class:`CodonVariantTable` used to write `variant_count_df_file`. """ df = pd.read_csv(variant_count_df_file) req_cols = ['barcode', 'library', 'variant_call_support', 'codon_substitutions', 'sample', 'count'] if not (set(req_cols) < set(df.columns)): raise ValueError(f"{variant_count_df} lacks required " f"columns {req_cols}") else: df = df[req_cols] if drop_all_libs: dropcol = "all libraries" if dropcol in df['library'].unique(): df = df.query('library != @dropcol') with tempfile.NamedTemporaryFile(mode='w') as f: (df .drop(columns=['sample', 'count']) .rename(columns={'codon_substitutions':'substitutions'}) .drop_duplicates() .to_csv(f, index=False) ) f.flush() cvt = cls(barcode_variant_file=f.name, geneseq=geneseq, substitutions_are_codon=True) for sample in df['sample'].unique(): for lib in cvt.libraries: idf = df.query('sample == @sample & library == @lib') if len(idf): cvt.addSampleCounts(lib, sample, idf[['barcode', 'count']] ) return cvt
def __init__(self, *, barcode_variant_file, geneseq, substitutions_are_codon=False, extra_cols=[]): """See main class doc string.""" self.geneseq = geneseq.upper() if not re.match('^[ATGC]+$', self.geneseq): raise ValueError(f"invalid nucleotides in {self.geneseq}") if ((len(geneseq) % 3) != 0) or len(geneseq) == 0: raise ValueError(f"`geneseq` of invalid length {len(self.geneseq)}") self.sites = list(range(1, len(self.geneseq) // 3 + 1)) self.codons = collections.OrderedDict([ (r, self.geneseq[3 * (r - 1) : 3 * r]) for r in self.sites]) self.aas = collections.OrderedDict([ (r, CODON_TO_AA[codon]) for r, codon in self.codons.items()]) df = pd.read_csv(barcode_variant_file) required_cols = ['library', 'barcode', 'substitutions', 'variant_call_support'] if not set(df.columns).issuperset(set(required_cols)): raise ValueError("`variantfile` does not have " f"required columns {required_cols}") if extra_cols and not set(df.columns).issuperset(set(extra_cols)): raise ValueError("`variantfile` does not have " f"`extra_cols` {extra_cols}") df = df[required_cols + extra_cols] self.libraries = sorted(df.library.unique().tolist()) self._valid_barcodes = {} for lib in self.libraries: barcodes = df.query('library == @lib').barcode if len(set(barcodes)) != len(barcodes): raise ValueError(f"duplicated barcodes for {lib}") self._valid_barcodes[lib] = set(barcodes) self._samples = {lib:[] for lib in self.libraries} self.variant_count_df = None if substitutions_are_codon: codonSubsFunc = self._sortCodonMuts else: codonSubsFunc = self._ntToCodonMuts self.barcode_variant_df = ( df # info about codon and amino-acid substitutions .assign(codon_substitutions= lambda x: x.substitutions .fillna('') .apply(codonSubsFunc), aa_substitutions= lambda x: x.codon_substitutions .apply(self.codonToAAMuts), n_codon_substitutions= lambda x: x.codon_substitutions .str .split() .apply(len), n_aa_substitutions= lambda x: x.aa_substitutions .str .split() .apply(len) ) # we no longer need initial `substitutions` column .drop('substitutions', axis='columns') # sort to ensure consistent order .assign(library=lambda x: pd.Categorical( x.library, categories=self.libraries, ordered=True ) ) .sort_values(['library', 'barcode']) .reset_index(drop=True) ) # check validity of codon substitutions given `geneseq` for codonmut in itertools.chain.from_iterable( self.barcode_variant_df .codon_substitutions.str.split()): m = re.match('^(?P<wt>[ATGC]{3})' '(?P<r>\d+)' '(?P<mut>[ATGC]{3})$', codonmut) if m is None: raise ValueError(f"invalid mutation {codonmut}") wt = m.group('wt') r = int(m.group('r')) mut = m.group('mut') if r not in self.sites: raise ValueError(f"invalid site {r} in codon mutation {codonmut}") if self.codons[r] != wt: raise ValueError(f"Wrong wildtype codon in {codonmut}. " f"Expected wildtype of {codons[r]}.") if wt == mut: raise ValueError(f"invalid mutation {codonmut}") # define some colors for plotting self._mutation_type_colors = { 'nonsynonymous':CBPALETTE[1], 'synonymous':CBPALETTE[2], 'stop':CBPALETTE[3] }
[docs] def samples(self, library): """List of all samples for `library`. Args: `library` (str) Valid `library` for the :class:`CodonVariantTable`. Returns: List of all samples for which barcode counts have been added. """ try: return self._samples[library] except KeyError: raise ValueError(f"invalid `library` {library}")
[docs] def addSampleCounts(self, library, sample, barcodecounts): """Add variant counts for a sample to `variant_count_df`. Args: `library` (str) Valid `library` for the :class:`CodonVariantTable`. `sample` (str) Sample name, must **not** already be in :class:`CodonVariantTable.samples` for `library`. `barcodecounts` (pandas DataFrame) Gives counts for each variant by barcode. Must have columns named "barcode" and "count". The "barcode" column must contain all the barcodes in :class:`CodonVariantTable.valid_barcodes` for `library`. Such data frames are returned by :class:`dms_tools2.barcodes.IlluminaBarcodeParser.parse`. """ if library not in self.libraries: raise ValueError(f"invalid library {library}") if sample in self.samples(library): raise ValueError(f"`library` {library} already " f"has `sample` {sample}") req_cols = ['barcode', 'count'] if not set(barcodecounts.columns).issuperset(set(req_cols)): raise ValueError(f"`barcodecounts` lacks columns {req_cols}") if len(barcodecounts) != len(set(barcodecounts.barcode.unique())): raise ValueError("`barcodecounts` has non-unique barcodes") if set(barcodecounts.barcode.unique()) != self.valid_barcodes(library): raise ValueError("barcodes in `barcodecounts` do not match " f"those expected for `library` {library}") self._samples[library].append(sample) df = (barcodecounts [req_cols] .assign(library=library, sample=sample) .merge(self.barcode_variant_df, how='inner', on=['library', 'barcode'], sort=False, validate='one_to_one') ) if self.variant_count_df is None: self.variant_count_df = df else: self.variant_count_df = pd.concat( [self.variant_count_df, df], axis='index', ignore_index=True, sort=False ) # samples in order added after ordering by library, getting # unique ones as here: https://stackoverflow.com/a/39835527 unique_samples = list(collections.OrderedDict.fromkeys( itertools.chain.from_iterable( [self.samples(lib) for lib in self.libraries]) )) # make library and sample categorical and sort self.variant_count_df = ( self.variant_count_df .assign(library=lambda x: pd.Categorical( x.library, categories=self.libraries, ordered=True ), sample=lambda x: pd.Categorical( x['sample'], categories=unique_samples, ordered=True ) ) .sort_values(['library', 'sample', 'count'], ascending=[True, True, False]) .reset_index(drop=True) )
[docs] def valid_barcodes(self, library): """Set of valid barcodes for `library`.""" if library not in self.libraries: raise ValueError(f"invalid `library` {library}; " f"valid libraries are {self.libraries}") else: return self._valid_barcodes[library]
[docs] def func_scores(self, preselection, *, pseudocount=0.5, by="barcode", combine_libs=False, syn_as_wt=False, logbase=2, permit_zero_wt=False, permit_self_comp=False): """Get data frame with functional scores for variants. The functional score is calculated from the change in counts for a variant pre- and post-selection using the formula in `Otwinoski et al (2018) <https://doi.org/10.1073/pnas.1804015115>`_. Specifically, if :math:`n^v_{pre}` and :math:`n^v_{post}` are the counts of variant :math:`v` pre- and post-selection, and if :math:`n^{wt}_{pre}` and :math:`n^{wt}_{post}` are the summed counts of **all** wildtype variants pre- and post- selection, then the functional score of the variant is :math:`f_v = \log_b\left(\\frac{n^v_{post} / n^{wt}_{post}}{n^v_{pre} / n^{wt}_{pre}}\\right)` and the variance due to Poisson sampling statistics is :math:`\\frac{1}{\left(\ln b\\right)^2}\left(\sigma^2_v = \\frac{1}{n^v_{post}} + \\frac{1}{n^{wt}_{post}} + \\frac{1}{n^v_{pre}} + \\frac{1}{n^{wt}_{pre}}\\right)` where :math:`b` is logarithm base (see `logbase` argument). For both calculations, a pseudocount (see `pseudocount` argument) is added to each count first. The wildtype counts are computed across all **fully wildtype** variants (see `syn_as_wt` for how this is defined). Args: `preselection` (str or dict) The pre-selection sample. If the same for all post- selection samples, then provide the name of the pre-selection sample. If it differs among post-selection samples, then provide a dict keyed by each post-selection sample with the pre-selection sample being the value. `pseudocount` (float) The pseudocount added to each count. `by` (str) Compute effects for each "barcode", set of "aa_substitutions", or set of "codon_substitutions". In the latter two cases, all barcodes with each set of substitutions are combined (see `combine_libs`). Note that if you use "aa_substitutions" then it may be more sensible to set `syn_as_wt` to `True`. `syn_as_wt` (bool) In the formula for the functional scores, consider variants with only synonymous mutations when determining wildtype counts? If `False`, only variants with **no** mutations of any type contribute to wildtype counts. `combine_libs` (bool) If `by` is "aa_substitutions" or "codon_substitutions", do we combine across libraries as well as barcodes? `logbase` (float) Base for logarithm when calculating functional score. `permit_zero_wt` (bool) If the wildtype counts are zero for any sample, do we raise an error or permit the calculation to proceed just using the pseudocount? `permit_self_comp` (bool) Permit comparisons where the same sample is used as the pre- and post-selection? Returns: A pandas Data Frame with the following columns: - "library": the library ("all libraries" if `combine_libs`) - "pre_sample": the pre-selection sample - "post_sample": the post-selection sample - the value corresponding to the grouping we are using to compute effects (the value of `by`) - "func_score": the functional score - "func_score_var": variance on the functional score - "pre_count": pre-selection counts - "post_count: post-selection counts - "pre_count_wt": pre-selection counts for all wildtype - "post_count_wt": post-selection counts for all wildtype - "pseudocount": the pseudocount value - as many of "aa_substitutions", "n_aa_substitutions", "codon_substitutions", and "n_codon_substitutions" as can be retained given the value of `by`. """ ordered_samples = self.variant_count_df['sample'].unique() if isinstance(preselection, str): # make `preselection` into dict preselection = {s:preselection for s in ordered_samples if s != preselection or permit_self_comp} elif not isinstance(preselection, dict): raise ValueError('`preselection` not str or dict') if not permit_self_comp: if any(pre == post for pre, post in preselection.items()): raise ValueError('`permit_self_comp` is False but there' ' are identical pre and post samples') # all samples of interest samples = set(preselection.keys()).union(set(preselection.values())) if not samples.issubset(set(ordered_samples)): extra_samples = samples - set(ordered_samples) raise ValueError(f"invalid samples: {extra_samples}") # get data frame with samples of interest if self.variant_count_df is None: raise ValueError('no sample variant counts have been added') df = (self.variant_count_df .query('sample in @samples') ) if combine_libs and (len(self.libraries) > 1): if any(s not in self.samples(lib) for lib in self.libraries for s in samples): raise ValueError('cannot use `combine_libs`, not every ' f"library has every sample: {samples}") df = (self.addMergedLibraries(df) .query('library == "all libraries"') ) # get wildtype counts for each sample and library if syn_as_wt: wt_col = 'n_aa_substitutions' else: wt_col = 'n_codon_substitutions' wt_counts = ( df .assign(count=lambda x: x['count'] * (0 == x[wt_col]) .astype('int')) .groupby(['library', 'sample']) .aggregate({'count':'sum'}) .reset_index() ) if (wt_counts['count'] <= 0).any() and not permit_zero_wt: raise ValueError(f"no wildtype counts:\n{wt_counts}") # sum counts in groups specified by `by` group_cols = ['codon_substitutions', 'n_codon_substitutions', 'aa_substitutions', 'n_aa_substitutions'] if by in {'aa_substitutions', 'codon_substitutions'}: group_cols = group_cols[group_cols.index(by) + 1 : ] elif by != 'barcode': raise ValueError(f"invalid `by` of {by}") df = (df .groupby(['library', 'sample', by] + group_cols) .aggregate({'count':'sum'}) .reset_index() ) # get data frame with pre- and post-selection samples / counts df_func_scores = [] for post_sample in ordered_samples: if post_sample not in preselection: continue pre_sample = preselection[post_sample] sample_dfs = [] for stype, s in [('pre', pre_sample), ('post', post_sample)]: sample_dfs.append( df .query('sample == @s') .rename(columns={'count':f"{stype}_count"}) .merge(wt_counts .rename(columns={'count':f"{stype}_count_wt"}), how='inner', validate='many_to_one' ) .rename(columns={'sample':f"{stype}_sample"}) ) df_func_scores.append( pd.merge(sample_dfs[0], sample_dfs[1], how='inner', validate='1:1') ) df_func_scores = pd.concat(df_func_scores, ignore_index=True, sort=False) # check pseudocount if pseudocount < 0: raise ValueError(f"`pseudocount` is < 0: {pseudocount}") elif (pseudocount == 0) and any((df_func_scores[c] <= 0).any() for c in ['pre_count', 'post_count', 'pre_count_wt', 'post_count_wt']): raise ValueError('some counts are zero, you must use ' '`pseudocount` > 0') # calculate functional score and variance df_func_scores = ( df_func_scores .assign( pseudocount=pseudocount, func_score=lambda x: scipy.log( ((x.post_count + x.pseudocount) / (x.post_count_wt + x.pseudocount)) / ((x.pre_count + x.pseudocount) / (x.pre_count_wt + x.pseudocount)) ) / scipy.log(logbase), func_score_var=lambda x: (1 / (x.post_count + x.pseudocount) + 1 / (x.post_count_wt + x.pseudocount) + 1 / (x.pre_count + x.pseudocount) + 1 / (x.pre_count_wt + x.pseudocount) ) / (scipy.log(logbase)**2) ) # set column order in data frame [['library', 'pre_sample', 'post_sample', by, 'func_score', 'func_score_var', 'pre_count', 'post_count', 'pre_count_wt', 'post_count_wt', 'pseudocount'] + group_cols] ) return df_func_scores
[docs] def n_variants_df(self, *, libraries='all', samples='all', min_support=1, variant_type='all', mut_type=None): """Number of variants per library / sample. Args: `variant_type` ("single" or "all") Include just variants with <= 1 `mut_type` mutation. `mut_type` ("aa", "codon", or `None`) If `variant_type` is single, set to "codon" or "aa" to indicate how to filter for single mutants. All other args: Same meaning as for :class:`CodonVariantTable.plotNumMutsHistogram`. Returns: DataFrame giving number of variants per library / sample. """ df, nlibraries, nsamples = self._getPlotData(libraries, samples, min_support) if variant_type == 'single': if mut_type in {'aa', 'codon'}: df = df.query(f"n_{mut_type}_substitutions <= 1") else: raise ValueError('`mut_type` must be "aa" or "single"') elif variant_type != 'all': raise ValueError(f"invalid `variant_type` {variant_type}") return (df .groupby(['library', 'sample']) .aggregate({'count':'sum'}) .reset_index() )
[docs] def mutCounts(self, variant_type, mut_type, *, libraries='all', samples='all', min_support=1): """Get counts of each individual mutation. Args: `variant_type` ("single" or "all") Include just single mutants, or all mutants? Other args: Same meaning as for :class:`CodonVariantTable.plotNumMutsHistogram` Returns: A tidy data frame with columns named "library", "sample", "mutation", "count", "mutation_type", and "site". """ df, nlibraries, nsamples = self._getPlotData(libraries, samples, min_support) samplelist = df['sample'].unique().tolist() librarylist = df['library'].unique().tolist() if mut_type == 'codon': wts = self.codons chars = CODONS mutation_types = ['nonsynonymous', 'synonymous', 'stop'] elif mut_type == 'aa': wts = self.aas chars = AAS_WITHSTOP mutation_types = ['nonsynonymous', 'stop'] else: raise ValueError(f"invalid mut_type {mut_type}") # data frame listing all mutations with count 0 mut_list = [] for r, wt in wts.items(): for mut in chars: if mut != wt: mut_list.append(f'{wt}{r}{mut}') all_muts = pd.concat([ pd.DataFrame({'mutation':mut_list, 'library':library, 'sample':sample, 'count':0}) for library, sample in itertools.product(librarylist, samplelist)]) if variant_type == 'single': df = df.query(f'n_{mut_type}_substitutions == 1') elif variant_type == 'all': df = df.query(f'n_{mut_type}_substitutions >= 1') else: raise ValueError(f"invalid variant_type {variant_type}") def _classify_mutation(mut_str): if mut_type == 'aa': m = re.match('^(?P<wt>[A-Z\*])\d+(?P<mut>[A-Z\*])$', mut_str) wt_aa = m.group('wt') mut_aa = m.group('mut') else: m = re.match('^(?P<wt>[ACTG]{3})\d+(?P<mut>[ACTG]{3})$', mut_str) wt_aa = CODON_TO_AA[m.group('wt')] mut_aa = CODON_TO_AA[m.group('mut')] if wt_aa == mut_aa: return 'synonymous' elif mut_aa == '*': return 'stop' else: return 'nonsynonymous' def _get_site(mut_str): if mut_type == 'aa': m = re.match('^[A-Z\*](?P<site>\d+)[A-Z\*]$', mut_str) else: m = re.match('^[ACTG]{3}(?P<site>\d+)[ACTG]{3}$', mut_str) site = int(m.group('site')) assert site in self.sites return site df = (df .rename(columns= {f"{mut_type}_substitutions":"mutation"} ) [['library', 'sample', 'mutation', 'count']] .pipe(tidy_split, column='mutation') .merge(all_muts, how='outer') .groupby(['library', 'sample', 'mutation']) .aggregate({'count':'sum'}) .reset_index() .assign( library=lambda x: pd.Categorical( x['library'], librarylist, ordered=True), sample=lambda x: pd.Categorical( x['sample'], samplelist, ordered=True), mutation_type=lambda x: pd.Categorical( x['mutation'].apply(_classify_mutation), mutation_types, ordered=True), site=lambda x: x['mutation'].apply(_get_site), ) .sort_values( ['library', 'sample', 'count', 'mutation'], ascending=[True, True, False, True]) .reset_index(drop=True) ) return df
[docs] def plotMutHeatmap(self, variant_type, mut_type, *, count_or_frequency='frequency', libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1): """Heatmap of mutation counts or frequencies. Args: `count_or_frequency` ("count" or "frequency") Plot mutation counts or frequencies? All other args Same meaning as for :meth:`CodonVariantTable.plotCumulMutCoverage`. Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df = self.mutCounts(variant_type, mut_type, samples=samples, libraries=libraries, min_support=min_support) n_variants = (self.n_variants_df(libraries=libraries, samples=samples, min_support=min_support, variant_type=variant_type, mut_type=mut_type) .rename(columns={'count':'nseqs'}) ) # order amino acids by Kyte-Doolittle hydrophobicity, # codons by the amino acid they encode aa_order = [tup[0] for tup in sorted( Bio.SeqUtils.ProtParamData.kd.items(), key=lambda tup: tup[1])] + ['*'] codon_order = list(itertools.chain.from_iterable( [AA_TO_CODONS[aa] for aa in aa_order])) df = (df [['library', 'sample', 'mutation', 'site', 'count']] .merge(n_variants, on=['library', 'sample']) .assign(frequency=lambda x: x['count'] / x['nseqs'], mut_char=lambda x: pd.Categorical( x.mutation.str.extract( '^[A-Z\*]+\d+(?P<mut_char>[A-Z\*]+)$') .mut_char, {'aa':aa_order, 'codon':codon_order}[mut_type], ordered=True) ) ) if count_or_frequency not in {'count', 'frequency'}: raise ValueError(f"invalid count_or_frequency " f"{count_or_frequency}") nlibraries = len(df['library'].unique()) nsamples = len(df['sample'].unique()) if mut_type == 'codon': height_per = 5.5 mut_desc = 'codon' elif mut_type == 'aa': height_per = 1.7 mut_desc = 'amino acid' else: raise ValueError(f"invalid mut_type {mut_type}") if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1.6 + 3.5 * nlibraries) height = heightscale * (0.8 + height_per * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1.6 + 3.5 * nsamples) height = heightscale * (0.8 + height_per * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") p = (ggplot(df, aes('site', 'mut_char', fill=count_or_frequency)) + geom_tile() + facet_grid(facet_str) + theme(figure_size=(width, height), legend_key=element_blank(), axis_text_y=element_text(size=6) ) + scale_x_continuous( name=f'{mut_desc} site', limits=(min(self.sites) - 1, max(self.sites) + 1), expand=(0, 0) ) + ylab(mut_desc) + scale_fill_cmap('gnuplot') ) if plotfile: p.save(plotfile, height=height, width=width, verbose=False, limitsize=False) return p
[docs] def plotMutFreqs(self, variant_type, mut_type, *, libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1): """Mutation frequency along length of gene. Args: Args have same meaning as for :meth:`CodonVariantTable.plotCumulMutCoverage`. Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df = self.mutCounts(variant_type, mut_type, samples=samples, libraries=libraries, min_support=min_support) n_variants = (self.n_variants_df(libraries=libraries, samples=samples, min_support=min_support, variant_type=variant_type, mut_type=mut_type) .rename(columns={'count':'nseqs'}) ) df = (df .groupby(['library', 'sample', 'mutation_type', 'site']) .aggregate({'count':'sum'}) .reset_index() .merge(n_variants, on=['library', 'sample']) .assign(freq=lambda x: x['count'] / x['nseqs']) ) nlibraries = len(df['library'].unique()) nsamples = len(df['sample'].unique()) if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1.6 + 1.8 * nlibraries) height = heightscale * (0.8 + 1 * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1.6 + 1.8 * nsamples) height = heightscale * (0.8 + 1 * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") if mut_type == 'aa': mut_desc = 'amino-acid' else: mut_desc = mut_type if height < 3: ylabel = (f'{mut_desc} mutation\nfrequency ' f'({variant_type} mutants)') else: ylabel = (f'{mut_desc} mutation frequency ' f'({variant_type} mutants)') p = (ggplot(df, aes('site', 'freq', color='mutation_type')) + geom_step() + scale_color_manual( [self._mutation_type_colors[m] for m in df.mutation_type.unique().sort_values().tolist()], name='mutation type' ) + scale_x_continuous( name=f'{mut_desc} site', limits=(min(self.sites), max(self.sites)) ) + ylab(ylabel) + facet_grid(facet_str) + theme(figure_size=(width, height), legend_key=element_blank(), legend_text=element_text(size=11) ) ) if plotfile: p.save(plotfile, height=height, width=width, verbose=False) return p
[docs] def plotCumulVariantCounts(self, *, variant_type='all', libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1, mut_type='aa', tot_variants_hline=True): """Plot number variants with >= that each number of counts. Args: `variant_type` ("single" or "all") Include just variants with <= one mutation of type, `mut_type` or all mutants? `tot_variants_hline` (bool) Include a dotted horizontal line indicating the total number of variants (useful since ones not observed at all will not apparent on plot). Other args: Same meaning as for :meth:`CodonVariantTable.plotNumMutsHistogram`. Returns: `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df, nlibraries, nsamples = self._getPlotData(libraries, samples, min_support) if variant_type == 'single': if mut_type == 'aa': mutstr = 'amino acid' elif mut_type == 'codon': mutstr = mut_type else: raise ValueError(f"invalid `mut_type` {mut_type}") ylabel = f"single {mutstr} variants with >= this many counts" df = df.query(f"n_{mut_type}_substitutions <= 1") elif variant_type == 'all': ylabel = 'variants with >= this many counts' else: raise ValueError(f"invalid `variant_type` {variant_type}") if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1 + 1.5 * nlibraries) height = heightscale * (0.6 + 1.5 * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1 + 1.5 * nsamples) height = heightscale * (0.6 + 1.5 * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") df = (getCumulVariantsByCount(df, group_cols=['library', 'sample']) .query('count > 0') ) p = (ggplot(df, aes('count', 'nvariants')) + geom_step() + facet_grid(facet_str) + xlab('number of counts') + ylab(ylabel) + scale_x_log10(labels=latexSciNot) + scale_y_continuous(labels=latexSciNot) + theme(figure_size=(width, height)) ) if tot_variants_hline: p = p + geom_hline(aes(yintercept='total_variants'), linetype='dashed', color=CBPALETTE[1]) if plotfile: p.save(plotfile, height=height, width=width, verbose=False) return p
[docs] def plotCumulMutCoverage(self, variant_type, mut_type, *, libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1, max_count=None): """Fraction of mutation seen <= some number of times. Args: `variant_type` ("single" or "all") Include just single mutants, or all mutants? Mutations are counted relative to `mut_type`. `max_count` (`None` or int) Plot cumulative fraction plot out to this number of observations of mutation. If `None`, a reasonable value is automatically determined. Other args: Same meaning as for :class:`CodonVariantTable.plotNumMutsHistogram` Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df = self.mutCounts(variant_type, mut_type, samples=samples, libraries=libraries, min_support=min_support) # add one to counts to plot fraction found < this many # as stat_ecdf by default does <= df = df.assign(count=lambda x: x['count'] + 1) if max_count is None: max_count = df['count'].quantile(0.75) nlibraries = len(df['library'].unique()) nsamples = len(df['sample'].unique()) if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1.6 + 1.3 * nlibraries) height = heightscale * (1 + 1.2 * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1.6 + 1.3 * nsamples) height = heightscale * (1 + 1.2 * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") if width > 4: xlabel = f'counts among {variant_type} mutants' else: xlabel = f'counts among\n{variant_type} mutants' mut_desc = {'aa':'amino-acid', 'codon':'codon'}[mut_type] if height > 3: ylabel = f'frac {mut_desc} mutations found < this many times' else: ylabel = f'frac {mut_desc} mutations\nfound < this many times' p = (ggplot(df, aes('count', color='mutation_type')) + stat_ecdf(geom='step', size=0.75) + coord_cartesian(xlim=(0, max_count)) + scale_color_manual( [self._mutation_type_colors[m] for m in df.mutation_type.unique().sort_values().tolist()], name='mutation type' ) + xlab(xlabel) + ylab(ylabel) + facet_grid(facet_str) + theme(figure_size=(width, height), legend_key=element_blank(), legend_text=element_text(size=11), axis_text_x=element_text(angle=90), ) ) if plotfile: p.save(plotfile, height=height, width=width, verbose=False) return p
[docs] def plotNumCodonMutsByType(self, variant_type, *, libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1): """Nonsynonymous, synonymous, stop mutations per variant. Args: `variant_type` ("single" or "all") Include just single-codon mutants and wildtype, or include all mutants. Other args: Same meaning as for :class:`CodonVariantTable.plotNumMutsHistogram` Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df, nlibraries, nsamples = self._getPlotData(libraries, samples, min_support) if variant_type == 'single': df = df.query('n_codon_substitutions <= 1') elif variant_type != 'all': raise ValueError(f"invalid variant_type {variant_type}") if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1 + 1.4 * nlibraries) height = heightscale * (1 + 1.3 * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1 + 1.4 * nsamples) height = heightscale * (1 + 1.3 * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") if height > 3: ylabel = f'mutations per variant ({variant_type} mutants)' else: ylabel = f'mutations per variant\n({variant_type} mutants)' codon_mut_types = ['nonsynonymous', 'synonymous', 'stop'] # mutations from stop to another amino-acid counted as nonsyn df = (df .assign( synonymous=lambda x: x.n_codon_substitutions - x.n_aa_substitutions, stop=lambda x: x.aa_substitutions.str .findall('[A-Z]\d+\*').apply(len), nonsynonymous=lambda x: x.n_codon_substitutions - x.synonymous - x.stop ) .melt(id_vars=['library', 'sample', 'count'], value_vars=codon_mut_types, var_name='mutation_type', value_name='num_muts') .assign( mutation_type=lambda x: pd.Categorical( x.mutation_type, categories=codon_mut_types, ordered=True), num_muts_count=lambda x: x.num_muts * x['count'] ) .groupby(['library', 'sample', 'mutation_type']) .aggregate({'num_muts_count':'sum', 'count':'sum'}) .reset_index() .assign(number=lambda x: x.num_muts_count / x['count']) ) p = (ggplot(df, aes('mutation_type', 'number', fill='mutation_type', label='number')) + geom_bar(stat='identity') + geom_text(size=8, va='bottom', format_string='{0:.3f}') + facet_grid(facet_str) + scale_y_continuous(name=ylabel, expand=(0.03, 0, 0.12, 0)) + scale_fill_manual( [self._mutation_type_colors[m] for m in df.mutation_type.unique().sort_values().tolist()] ) + theme(figure_size=(width, height), axis_title_x=element_blank(), axis_text_x=element_text(angle=90, size=11), legend_position='none') ) if plotfile: p.save(plotfile, height=height, width=width, verbose=False) return p
[docs] def plotNumMutsHistogram(self, mut_type, *, libraries='all', samples='all', plotfile=None, orientation='h', widthscale=1, heightscale=1, min_support=1, max_muts=None): """Plot histograms of number of mutations per variant. Args: `mut_type` (str) Type of mutation: "codon" or "aa". `libraries` ("all", "all_only", or list) Set to "all" to include all libraries including a merge of the libraries, "all_only" to only include the merge of the libraries, or a list of libraries. `samples` (the str "all", `None`, or list) Set to "all" to include all samples, `None` to just count each barcoded variant once, or specify a list of samples. `plotfile` (`None` or str) Name of file to which we save plot. `orientation` (the str 'h' or 'v') Do we facet libraries horizontally or vertically? `widthscale` (float or int) Expand width of plot by this much. `heightscale` (float or int) Expand height of plot by this much. `min_support` (int) Only include variants with at least this large of a variant call support. `max_muts` (int or `None`) In histogram, group together all variants with >= this many mutations; set to `None` for no cutoff. Returns: `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ df, nlibraries, nsamples = self._getPlotData(libraries, samples, min_support) if mut_type == 'aa': mut_col = 'n_aa_substitutions' xlabel = 'amino-acid mutations' elif mut_type == 'codon': mut_col = 'n_codon_substitutions' xlabel = 'codon mutations' else: raise ValueError(f"invalid mut_type {mut_type}") if orientation == 'h': facet_str = 'sample ~ library' width = widthscale * (1 + 1.5 * nlibraries) height = heightscale * (0.6 + 1.5 * nsamples) elif orientation == 'v': facet_str = 'library ~ sample' width = widthscale * (1 + 1.5 * nsamples) height = heightscale * (0.6 + 1.5 * nlibraries) else: raise ValueError(f"invalid `orientation` {orientation}") df[mut_col] = scipy.clip(df[mut_col], None, max_muts) df = (df .groupby(['library', 'sample', mut_col]) .aggregate({'count':'sum'}) .reset_index() ) p = (ggplot(df, aes(mut_col, 'count')) + geom_bar(stat='identity') + facet_grid(facet_str) + xlab(xlabel) + scale_y_continuous(labels=latexSciNot) + theme(figure_size=(width, height)) ) if plotfile: p.save(plotfile, height=height, width=width, verbose=False) return p
[docs] def writeCodonCounts(self, single_or_all, *, outdir=None, include_all_libs=False): """Writes codon counts files for all libraries and samples. Files are written in `format <https://jbloomlab.github.io/dms_tools2/dms2_bcsubamp.html#id12>`_ produced by `dms2_bcsubamp <https://jbloomlab.github.io/dms_tools2/dms2_bcsubamp.html>`_. Args: `single_or_all` ("single" or "all") If "single", then counts are just from single-codon mutants and wildtype, and we count all wildtype codons and just mutated codon for single-codon mutants. If "all", we count all codons for all variants at all sites. This is appropriate if enrichment of each mutation is supposed to represent its effect for "single", and if enrichment of mutation is supposed to represent its average effect across genetic backgrounds in the library for "all" provided mutations are Poisson distributed. `outdir` (`None` or str) Name of directory into which we write counts, created if it does not exist. Use `None` to write to current directory. `include_all_libs` (bool) Include data for a library (named "all-libraries") that has the summmed data for all individual libraries if there are multiple libraries? Returns: Pandas data frame with columns "library", "sample", and "countfile". The "countfile" columns gives name of the createad CSV file, ``<library>_<sample>_codoncounts.csv``. """ codonmatch = re.compile(f'^(?P<wt>{"|".join(CODONS)})' '(?P<r>\d+)' f'(?P<mut>{"|".join(CODONS)})$' ) def _parseCodonMut(mutstr): m = codonmatch.match(mutstr) return (m.group('wt'), int(m.group('r')), m.group('mut')) if self.variant_count_df is None: raise ValueError("no samples with counts") if single_or_all not in {'single', 'all'}: raise ValueError(f"invalid `single_or_all` {single_or_all}") if outdir is not None: os.makedirs(outdir, exist_ok=True) else: outdir = '' if include_all_libs: df = self.addMergedLibraries(self.variant_count_df, all_lib='all-libraries') else: df = self.variant_count_df countfiles = [] liblist = [] samplelist = [] for lib, sample in itertools.product( df['library'].unique().tolist(), df['sample'].unique().tolist() ): i_df = df.query('library == @lib & sample == @sample') if len(i_df) == 0: continue # no data for this library and sample countfile = os.path.join(outdir, f'{lib}_{sample}_codoncounts.csv') countfiles.append(countfile) liblist.append(lib) samplelist.append(sample) codoncounts = {codon:[0] * len(self.sites) for codon in CODONS} if single_or_all == 'single': n_wt = (i_df .query('n_codon_substitutions == 0') ['count'] .sum() ) for isite, site in enumerate(self.sites): codoncounts[self.codons[site]][isite] += n_wt for mut, count in (i_df .query('n_codon_substitutions == 1') [['codon_substitutions', 'count']] .itertuples(index=False, name=None) ): wtcodon, r, mutcodon = _parseCodonMut(mut) codoncounts[mutcodon][r - 1] += count elif single_or_all == 'all': n_wt = i_df['count'].sum() for isite, site in enumerate(self.sites): codoncounts[self.codons[site]][isite] += n_wt for muts, count in (i_df .query('n_codon_substitutions > 0') [['codon_substitutions', 'count']] .itertuples(index=False, name=None) ): for mut in muts.split(): wtcodon, r, mutcodon = _parseCodonMut(mut) codoncounts[mutcodon][r - 1] += count codoncounts[wtcodon][r - 1] -= count else: raise ValueError(f"invalid `single_or_all` {single_or_all}") counts_df = pd.DataFrame(collections.OrderedDict( [('site', self.sites), ('wildtype', [self.codons[r] for r in self.sites])] + [(codon, codoncounts[codon]) for codon in CODONS] )) counts_df.to_csv(countfile, index=False) assert all(map(os.path.isfile, countfiles)) return pd.DataFrame({'library':liblist, 'sample':samplelist, 'countfile':countfiles})
[docs] @staticmethod def classifyVariants(df, *, variant_class_col='variant_class', max_aa=2): """Classifies codon variants in `df`. Args: `df` (pandas DataFrame) Must have columns named 'aa_substitutions', 'n_aa_substitutions', and 'n_codon_substitutions'. For instance, a data frame of this type can be obtained via the `variant_count_df` or `barcode_variant_df` of a :class:`CodonVariantTable`, or via :meth:`CodonVariantTable.func_scores`. `variant_class_col` (str) Name of column added to `df` that contains variant classification. Overwritten if already exists. `max_aa` (int) When classifying variants, group all with >= this many amino-acid mutations. Returns: A copy of `df` with the column specified by `variant_class_col` classifying variants as: - 'wildtype': no codon mutations - 'synonymous': only synonymous codon mutations - 'stop': at least one stop-codon mutation - '{n_aa} nonsynonymous' where `n_aa` is the number of amino-acid mutations, or is '>{max_aa}' if there are more than `max_aa` such mutations. >>> df = pd.DataFrame.from_records( ... [('AAA', '', 0, 0), ... ('AAG', '', 0, 1), ... ('ATA', 'M1* G5K', 2, 3), ... ('GAA', 'G5H', 1, 2), ... ('CTT', 'M1C G5C', 2, 3), ... ('CTT', 'M1A L3T G5C', 3, 3), ... ], ... columns=['barcode', 'aa_substitutions', ... 'n_aa_substitutions', 'n_codon_substitutions'] ... ) >>> CodonVariantTable.classifyVariants(df) barcode aa_substitutions n_aa_substitutions n_codon_substitutions variant_class 0 AAA 0 0 wildtype 1 AAG 0 1 synonymous 2 ATA M1* G5K 2 3 stop 3 GAA G5H 1 2 1 nonsynonymous 4 CTT M1C G5C 2 3 >=2 nonsynonymous 5 CTT M1A L3T G5C 3 3 >=2 nonsynonymous """ req_cols = ['aa_substitutions', 'n_aa_substitutions', 'n_codon_substitutions'] if not (set(req_cols) <= set(df.columns)): raise ValueError(f"`df` does not have columns {req_cols}") def _classify_func(row): if row['n_codon_substitutions'] == 0: return 'wildtype' elif row['n_aa_substitutions'] == 0: return 'synonymous' elif '*' in row['aa_substitutions']: return 'stop' elif row['n_aa_substitutions'] < max_aa: return f"{row['n_aa_substitutions']} nonsynonymous" else: return f">={max_aa} nonsynonymous" return df.assign(**{variant_class_col: lambda x: x.apply(_classify_func, axis=1) })
[docs] @staticmethod def addMergedLibraries(df, *, all_lib='all libraries'): """Add data to `df` for all libraries merged. Args: `df` (pandas DataFrame) DataFrame that includes columns named "library" and "barcode". `all_lib` (str) Name given to library that is merge of all other libraries. Returns: If `df` only has data for one library, just returns `df`. Otherwise returns a copy of `df` that has a new library with the name given by `all_lib` and contains the data for all individual libraries, with the "barcode" column giving the original library name followed by a hyphen and the barcode. """ libs = df.library.unique().tolist() if len(libs) <= 1: return df if all_lib in libs: raise ValueError(f"library {all_lib} already exists") df = (pd.concat([df, df.assign( barcode=lambda x: x.library.str .cat(x.barcode, sep='-'), library=all_lib) ], axis='index', ignore_index=True, sort=False) .assign(library=lambda x: pd.Categorical( x['library'], categories=libs + [all_lib], ordered=True) ) ) return df
def _getPlotData(self, libraries, samples, min_support): """Gets data to plot from library and sample filters. Args: `libraries`, `samples`, `min_support` have meaning as for :class:`CodonVariantTable.plotNumMutsHistogram`. Returns: The 3-tuple `(df, nlibraries, nsamples)` where: - `df`: DataFrame with data to plot. - `nlibraries`: number of libraries being plotted. - `nsamples`: number of samples being plotted. """ if samples is None: df = (self.barcode_variant_df .assign(sample='barcoded variants') .assign(count=1) ) elif samples == 'all': if self.variant_count_df is None: raise ValueError('no samples have been added') df = self.variant_count_df elif isinstance(samples, list): all_samples = set(itertools.chain.from_iterable( self.samples(lib) for lib in self.libraries)) if not all_samples.issuperset(set(samples)): raise ValueError(f"invalid sample(s) in {samples}") if len(samples) != len(set(samples)): raise ValueError(f"duplicate samples in {samples}") df = self.variant_count_df.query('sample in @samples') else: raise ValueError(f"invalid `samples` {samples}") df = df.query('variant_call_support >= @min_support') if not len(df): raise ValueError(f"no samples {samples}") else: nsamples = len(df['sample'].unique()) if libraries == 'all': df = self.addMergedLibraries(df) elif libraries == 'all_only': df = (self.addMergedLibraries(df) .query('library == "all libraries"') ) elif isinstance(libraries, list): if not set(self.libraries).issuperset(set(libraries)): raise ValueError(f"invalid library in {libraries}") if len(libraries) != len(set(libraries)): raise ValueError(f"duplicate library in {libraries}") df = df.query('library in @libraries') else: raise ValueError(f"invalid `libraries` {libraries}") if not len(df): raise ValueError(f"no libraries {libraries}") else: nlibraries = len(df['library'].unique()) return (df, nlibraries, nsamples)
[docs] @classmethod def codonToAAMuts(self, codon_mut_str): """Converts string of codon mutations to amino-acid mutations. Args: `codon_mut_str` (str) Codon mutations, delimited by a space and in 1, 2, ... numbering. Returns: String with amino acid mutations in 1, 2, ... numbering. >>> CodonVariantTable.codonToAAMuts('ATG1GTG GGA2GGC TGA3AGA') 'M1V *3R' """ aa_muts = {} for mut in codon_mut_str.upper().split(): m = re.match('^(?P<wt>[ATGC]{3})(?P<r>\d+)(?P<mut>[ATGC]{3})$', mut) if not m: raise ValueError(f"invalid mutation {mut} in {codon_mut_str}") r = int(m.group('r')) if r in aa_muts: raise ValueError(f"duplicate codon mutation for {r}") wt_codon = m.group('wt') mut_codon = m.group('mut') if wt_codon == mut_codon: raise ValueError(f"invalid mutation {mut}") wt_aa = CODON_TO_AA[wt_codon] mut_aa = CODON_TO_AA[mut_codon] if wt_aa != mut_aa: aa_muts[r] = f"{wt_aa}{r}{mut_aa}" return ' '.join([mut_str for r, mut_str in sorted(aa_muts.items())])
def _sortCodonMuts(self, mut_str): """Sort space-delimited codon mutations and make uppercase. >>> geneseq = 'ATGGGATGA' >>> with tempfile.NamedTemporaryFile(mode='w') as f: ... _ = f.write('library,barcode,substitutions,variant_call_support') ... f.flush() ... variants = CodonVariantTable( ... barcode_variant_file=f.name, ... geneseq=geneseq ... ) >>> variants._sortCodonMuts('GGA2CGT ATG1GTG') 'ATG1GTG GGA2CGT' """ muts = {} for mut in mut_str.upper().split(): m = re.match('^(?P<wt>[ATCG]{3})(?P<r>\d+)(?P<mut>[ACTG]{3})$', mut) if not m: raise ValueError(f"invalid codon mutation {mut}") wt_codon = m.group('wt') r = int(m.group('r')) mut_codon = m.group('mut') if wt_codon == mut_codon: raise ValueError(f"invalid codon mutation {mut}") if r not in self.sites: raise ValueError(f"invalid site in codon mutation {mut}") if wt_codon != self.codons[r]: raise ValueError(f"invalid wt in codon mutation {mut}") if r in muts: raise ValueError(f"duplicate mutation at codon {mut}") muts[r] = mut return ' '.join(mut for r, mut in sorted(muts.items())) def _ntToCodonMuts(self, nt_mut_str): """Converts string of nucleotide mutations to codon mutations. Args: `nt_mut_str` (str) Nucleotide mutations, delimited by a space and in 1, 2, ... numbering. Returns: String with codon mutations in 1, 2, ... numbering of codon sites. >>> geneseq = 'ATGGGATGA' >>> with tempfile.NamedTemporaryFile(mode='w') as f: ... _ = f.write('library,barcode,substitutions,variant_call_support') ... f.flush() ... variants = CodonVariantTable( ... barcode_variant_file=f.name, ... geneseq=geneseq ... ) >>> variants._ntToCodonMuts('A1G G4C A6T') 'ATG1GTG GGA2CGT' >>> variants._ntToCodonMuts('G4C A6T A1G') 'ATG1GTG GGA2CGT' >>> variants._ntToCodonMuts('A1G G4C G6T') Traceback (most recent call last): ... ValueError: nucleotide 6 should be A not G """ mut_codons = collections.defaultdict(set) for mut in nt_mut_str.upper().split(): m = re.match('^(?P<wt>[ATCG])(?P<i>\d+)(?P<mut>[ATCG])$', mut) if not m: raise ValueError(f"invalid mutation {mut}") wt_nt = m.group('wt') i = int(m.group('i')) mut_nt = m.group('mut') if wt_nt == mut_nt: raise ValueError(f"invalid mutation {mut}") if i > len(self.geneseq) or i < 1: raise ValueError(f"invalid nucleotide site {i}") if self.geneseq[i - 1] != wt_nt: raise ValueError(f"nucleotide {i} should be " f"{self.geneseq[i - 1]} not {wt_nt}") icodon = (i - 1) // 3 + 1 i_nt = (i - 1) % 3 assert self.codons[icodon][i_nt] == wt_nt if i_nt in mut_codons[icodon]: raise ValueError(f"duplicate mutations {i_nt} in {icodon}") mut_codons[icodon].add((i_nt, mut_nt)) codon_mut_list = [] for r, r_muts in sorted(mut_codons.items()): wt_codon = self.codons[r] mut_codon = list(wt_codon) for i, mut_nt in r_muts: mut_codon[i] = mut_nt codon_mut_list.append(f"{wt_codon}{r}{''.join(mut_codon)}") return ' '.join(codon_mut_list)
[docs]def simulateSampleCounts(*, variants, phenotype_func, variant_error_rate, pre_sample, post_samples, pre_sample_name='pre-selection', seed=1): """Simulate pre- and post-selection variant counts. Simulate variant counts for experiment where barcodes are sequenced pre- and post-selection. Args: `variants` (:class:`CodonVariantTable`) Holds variants used in simulation. `phenotype_func` (function) Takes a row from `variants.barcode_variant_df` and returns the phenotype. Typically this is calculated from the "aa_substitutions" or "codon_substitutions" column. The phenotype is a number >= 0, and represents the expected enrichment of the variant relative to wildtype post-selection (values > 1 indicate beneficial). For instance, you could pass :meth:`PhenotypeSimulator.observedPhenotype`. `variant_error_rate` (float) Rate at which variants in `variants` are mis-called. Provide the probability that a variant has a spuriously called (or missing) codon mutation: with this probability, each variant then has a random codon mutation added or taken away before being passed to `phenotype_func`. Designed to simulate the fact that the method used to link barcodes to variants is probably not perfect. `pre_sample` (pandas DataFrame or dict) The counts of each variant pre-selection. You can specify counts or have them simulated: - To specify counts, provide a data frame with columns "library", "barcode", and "count" where "count" is the pre-selection counts. - To simulate, provide dict with keys "total_count" and "uniformity". For each library, we simulate pre-selection counts as a draw of "total_count" counts from a multinomial parameterized by pre-selection frequences drawn from a Dirichlet distribution with a concentration parameter of of "uniformity" (larger uniformity values mean more even libraries; 5 is reasonable value). `post_samples` (dict) Nested dict indicating post-selection samples. Keyed by name of each post-selection sample, with value being another dict. The post-selection counts are drawn from a multinomial parameterized by the pre-selection frequencies (frequencies calculated from counts if `pre_sample` gives counts, Dirichlet frequencies if `pre_sample` is simulated). Add noise by the following parameters provided in the dict for each sample: - "total_count": total overall counts per library. - "noise": add additional noise to selection by multiplying phenotype times a random variable drawn from a normal distribution with mean 1 and this standard deviation (truncated at lower end to zero). Set noise to 0 for no noise. - "bottleneck": put the pre-selection frequencies through a bottleneck of this size, then re-calcuate initial frequencies that selection acts upon. Set to `None` for no bottleneck. `pre_sample_name` (str) Name used for the pre-selection sample. `seed` (None or int) If not `None`, random number seed set before executing function (sets `scipy.random.seed`). Returns: A pandas DataFrame with the following columns: - "library" - "barcode" - "sample" - "count" The first two columns indicate the library and barcode for each variant as in the `barcode_variant_df` attribute of `variants`, the "sample" and "count" columns give counts for each sample. """ if seed is not None: scipy.random.seed(seed) if pre_sample_name in post_samples: raise ValueError('`pre_sample_name` is in `post_samples`') #----------------------------------------- # internal function def _add_variant_errors(codon_substitutions): """Add errors to variant according to `variant_error_rate`.""" if scipy.random.random() < variant_error_rate: muts = codon_substitutions.split() if len(muts) == 0 or scipy.random.random() < 0.5: # add mutation mutatedsites = set(map( int, [re.match('^[ATGC]{3}(?P<site>\d+)[ATGC]{3}$', mut).group('site') for mut in muts] )) unmutatedsites = [r for r in variants.sites if r not in mutatedsites] if not unmutatedsites: raise RuntimeError("variant already has all mutations") errorsite = scipy.random.choice(unmutatedsites) wtcodon = variants.codons[errorsite] mutcodon = scipy.random.choice([c for c in CODONS if c != wtcodon]) muts.append(f'{wtcodon}{errorsite}{mutcodon}') return ' '.join(muts) else: # remove mutation muts = muts.pop(scipy.random.randint(0, len(muts))) return muts else: return codon_substitutions #----------------------------------------- barcode_variant_df = ( variants.barcode_variant_df [['library', 'barcode', 'codon_substitutions']] .assign( codon_substitutions=lambda x: x.codon_substitutions .apply(_add_variant_errors), aa_substitutions=lambda x: x.codon_substitutions .apply(CodonVariantTable.codonToAAMuts), phenotype=lambda x: x.apply(phenotype_func, axis=1) ) [['library', 'barcode', 'phenotype']] ) libraries = variants.libraries if isinstance(pre_sample, pd.DataFrame): # pre-sample counts specified req_cols = ['library', 'barcode', 'count'] if not set(req_cols).issubset(set(pre_sample.columns)): raise ValueError(f"pre_sample lacks cols {req_cols}:" f"\n{pre_sample}") cols = ['library', 'barcode'] if (pre_sample[cols].sort_values(cols).reset_index() != barcode_variant_df[cols].sort_values(cols).reset_index() ).any(): raise ValueError("pre_sample DataFrame lacks required " "library and barcode columns") barcode_variant_df = ( barcode_variant_df .merge(pre_sample[req_cols], on=['library', 'barcode']) .rename(columns={'count':pre_sample_name}) ) # "true" pre-selection freqs are just input counts nperlib = (barcode_variant_df .groupby('library') .rename('total_count') .reset_index() ) barcode_variant_df = ( barcode_variant_df .merge(nperlib, on='library') .assign(pre_freqs=lambda x: x[pre_sample_name] / x.total_count) .drop(columns='total_count') ) elif isinstance(pre_sample, dict): pre_req_keys = {'uniformity', 'total_count'} if set(pre_sample.keys()) != pre_req_keys: raise ValueError(f"pre_sample lacks required keys {pre_req_keys}") pre_df_list = [] for lib in libraries: df = ( barcode_variant_df .query('library == @lib') .assign( pre_freq=lambda x: scipy.random.dirichlet( pre_sample['uniformity'] * scipy.ones(len(x))), count=lambda x: scipy.random.multinomial( pre_sample['total_count'], x.pre_freq), sample=pre_sample_name ) ) pre_df_list.append(df) barcode_variant_df = pd.concat(pre_df_list) else: raise ValueError("pre_sample not DataFrame / dict: " f"{pre_sample}") cols = ['library', 'barcode', 'sample', 'count', 'pre_freq', 'phenotype'] assert set(barcode_variant_df.columns) == set(cols), ( f"cols = {set(cols)}\nbarcode_variant_df.columns = " f"{set(barcode_variant_df.columns)}") for col in cols: if col in post_samples: raise ValueError(f"post_samples can't have key {col}; " "choose another sample name") df_list = [barcode_variant_df[cols[ : 4]]] def _bottleneck_freqs(pre_freq, bottleneck): if bottleneck is None: return pre_freq else: return scipy.random.multinomial(bottleneck, pre_freq) / bottleneck post_req_keys = {'bottleneck', 'noise', 'total_count'} for lib, (sample, sample_dict) in itertools.product( libraries, sorted(post_samples.items())): if set(sample_dict.keys()) != post_req_keys: raise ValueError(f"post_samples {sample} lacks keys {post_req_keys}") lib_df = ( barcode_variant_df.query('library == @lib') .assign( sample=sample, # simulated pre-selection freqs after bottleneck bottleneck_freq=lambda x: _bottleneck_freqs(x.pre_freq, sample_dict['bottleneck']), # post-selection freqs with noise noise=scipy.clip(scipy.random.normal(1, sample_dict['noise']), 0, None), post_freq_nonorm=lambda x: x.bottleneck_freq * x.phenotype * x.noise, post_freq=lambda x: x.post_freq_nonorm / x.post_freq_nonorm.sum(), # post-selection counts simulated from frequencies count=lambda x: scipy.random.multinomial( sample_dict['total_count'], x.post_freq) ) .rename(columns={'post_counts':sample}) [['library', 'barcode', 'sample', 'count']] ) df_list.append(lib_df) return pd.concat(df_list)
[docs]class PhenotypeSimulator: """Simulates phenotypes of variants under plausible model. Mutational effects on latent phenotype are simulated to follow compound normal distribution; latent phenotype maps to observed phenotype via sigmoid. This distinction between latent and observed phenotype parallel the "global epistasis" models of `Otwinoski et al <https://doi.org/10.1073/pnas.1804015115>`_ and `Sailer and Harms <http://www.genetics.org/content/205/3/1079>`_. Initialize with a codon sequence and random number seed. The effects of mutations on the latent phenotype are then drawn from a compound normal distribution biased to negative values. To calculate latent and observed phenotypes, pass rows of a :meth:`CodonVariantTable.barcode_variant_df` to :meth:`PhenotypeSimulator.latentPhenotype` or :meth:`PhenotypeSimulator.observedPhenotype`. Args: `geneseq` (str) Codon sequence of wild-type gene. `seed` (int) Random number seed. `wt_latent` (float) Latent phenotype of wildtype. `norm_weights` (list of tuples) Specifies compound normal distribution of mutational effects on latent phenotype. Each tuple is `(weight, mean, sd)`, giving weight, mean, and standard deviation of each Gaussian in compound normal. `stop_effect` (float) Effect of stop codon at any position. Attributes: `wt_latent` (float) Value for wildtype latent phenotype. `muteffects` (dict) Effects on latent phenotype of each amino-acid mutation. """ def __init__(self, geneseq, *, seed=1, wt_latent=1, norm_weights=[(0.4, -0.5, 1), (0.6, -5, 2.5)], stop_effect=-10): """See main class docstring for how to initialize.""" self.wt_latent = wt_latent # simulate muteffects from compound normal distribution self.muteffects = {} scipy.random.seed(seed) weights, means, sds = zip(*norm_weights) cumweights = scipy.cumsum(weights) for icodon in range(len(geneseq) // 3): wt_aa = CODON_TO_AA[geneseq[3 * icodon : 3 * icodon + 3]] for mut_aa in AAS_WITHSTOP: if mut_aa != wt_aa: if mut_aa == '*': muteffect = stop_effect else: # choose Gaussian from compound normal i = scipy.argmin(cumweights < scipy.random.rand()) # draw mutational effect from chosen Gaussian muteffect = scipy.random.normal(means[i], sds[i]) self.muteffects[f'{wt_aa}{icodon + 1}{mut_aa}'] = muteffect
[docs] def latentPhenotype(self, v): """Returns latent phenotype of a variant. Args: `v` (dict or row of pandas DataFrame) Must have key 'aa_substitutions' that gives space de-limited list of amino-acid mutations. """ return self.wt_latent + sum([self.muteffects[m] for m in v['aa_substitutions'].split()])
[docs] def observedPhenotype(self, v): """Like `latentPhenotype` but returns observed phenotype.""" return self.latentToObservedPhenotype(self.latentPhenotype(v))
[docs] @staticmethod def latentToObservedPhenotype(latent): """Returns observed phenotype from latent phenotype.""" return 1 / (1 + math.exp(-latent - 3))
[docs] def plotLatentVersusObservedPhenotype(self, *, latent_min=-15, latent_max=5, npoints=200): """Plots observed phenotype as function of latent phenotype. Plot includes a vertical line at wildtype latent phenotype. Args: `latent_min` (float) Smallest value of latent phenotype on plot. `latent_max` (float) Largest value of latent phenotype on plot. `npoints` (int) Plot a line fit to this many points. Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ latent = scipy.linspace(latent_min, latent_max, npoints) p = (ggplot(pd.DataFrame(dict(latent=latent)) .assign(observed=lambda x: x.latent.apply( self.latentToObservedPhenotype)), aes('latent', 'observed') ) + geom_line() + geom_vline(xintercept=self.wt_latent, color=CBPALETTE[1], linetype='dashed') + theme(figure_size=(3.5, 2.5)) + xlab('latent phenotype') + ylab('observed phenotype') ) return p
[docs] def plotMutsHistogram(self, latent_or_observed, *, mutant_order=1, bins=30): """Plots distribution of phenotype for all mutants. Plot includes a vertical line at wildtype phenotype. Args: `latent_or_observed` ("latent" or "observed") Which type of phenotype to plot. `mutant_order` (int) Plot mutations of this order. Currently only works for 1 (single mutants). `bins` (int) Number of bins in histogram. Returns: A `plotnine <https://plotnine.readthedocs.io>`_ plot; can be displayed in a Jupyter notebook with `p.draw()`. """ if mutant_order != 1: raise ValueError('only implemented for `mutant_order` of 1') if latent_or_observed == 'latent': phenoFunc = self.latentPhenotype elif latent_or_observed == 'observed': phenoFunc = self.observedPhenotype else: raise ValueError('invalid value of `latent_or_observed`') phenotypes = [phenoFunc({'aa_substitutions':m}) for m in self.muteffects.keys()] p = (ggplot(pd.DataFrame({'phenotype':phenotypes}), aes('phenotype')) + geom_histogram(bins=bins) + theme(figure_size=(3.5, 2.5)) + ylab(f"number of {mutant_order}-mutants") + xlab(f"{latent_or_observed} phenotype") + geom_vline(xintercept=phenoFunc({'aa_substitutions':''}), color=CBPALETTE[1], linetype='dashed') ) return p
[docs]def tidy_split(df, column, sep=' ', keep=False): """ Split values of a column and expand so new DataFrame has one split value per row. Filters rows where the column is missing. Taken from https://stackoverflow.com/a/39946744 Args: df : pandas DataFrame dataframe with the column to split and expand column : str the column to split and expand sep : str the string used to split the column's values keep : bool whether to retain the presplit value as it's own row Returns: pandas DataFrame Returns a dataframe with the same columns as `df`. """ indexes = list() new_values = list() df = df.dropna(subset=[column]) for i, presplit in enumerate(df[column].astype(str)): values = presplit.split(sep) if keep and len(values) > 1: indexes.append(i) new_values.append(presplit) for value in values: indexes.append(i) new_values.append(value) new_df = df.iloc[indexes, :].copy() new_df[column] = new_values return new_df
[docs]def rarefyBarcodes(barcodecounts, *, barcodecol='barcode', countcol='count', maxpoints=1e5, logspace=True): """Rarefaction curve of barcode observations. Uses analytical formula for rarefaction defined `here <https://en.wikipedia.org/wiki/Rarefaction_(ecology)#Derivation>`_. Args: `barcodecounts` (pandas DataFrame) Has columns with names matching `barcodecol` and `countcol`. `barcodecol` (str) Name of column listing all unique barcodes. `countcol` (str) Name of column with observed counts of each barcode. `maxpoints` (int) Only calculate rarefaction curve at this many points. The benefit is that it can become very costly to calculate the curve for many points. `logspace` (bool) Logarithmically space the `maxpoint` points. If False, space them linearly. Returns: A pandas DataFrame with the columns `ncounts` and `nbarcodes`, giving the number of unique barcodes observed for each total number observed counts. Here is a small example: >>> barcodecounts = pd.DataFrame({'barcode':['A', 'G', 'C', 'T'], ... 'count': [ 4, 2, 1, 0]}) >>> rarefaction_curve = rarefyBarcodes(barcodecounts) >>> rarefaction_curve ncounts nbarcodes 0 1 1.000000 1 2 1.666667 2 3 2.114286 3 4 2.428571 4 5 2.666667 5 6 2.857143 6 7 3.000000 Verify that the result from this example matches what is obtained by random sampling: >>> random.seed(1) >>> barcodelist = [] >>> for tup in barcodecounts.itertuples(index=False): ... barcodelist += [tup.barcode] * tup.count >>> nrand = 10000 >>> ncounts = list(range(1, barcodecounts['count'].sum() + 1)) >>> nbarcodes = [] >>> for ncount in ncounts: ... nbarcodes.append(sum(len(set(random.sample(barcodelist, ncount))) ... for _ in range(nrand)) / nrand) >>> sim_rarefaction_curve = pd.DataFrame(dict(ncounts=ncounts, ... nbarcodes=nbarcodes)) >>> scipy.allclose(rarefaction_curve, sim_rarefaction_curve, atol=1e-2) True """ if len(barcodecounts) != len(barcodecounts[barcodecol].unique()): raise ValueError('non-unique barcodes in `barcodecounts`') # follow nomenclature at # https://en.wikipedia.org/wiki/Rarefaction_(ecology)#Derivation Ni = barcodecounts.set_index(barcodecol)[countcol].to_dict() N = sum(Ni.values()) K = len(barcodecounts) Mj = collections.Counter(Ni.values()) Nk, num = map(scipy.array, zip(*Mj.items())) # use simplification that (N - Ni)Cr(n) / (N)Cr(n) = # [(N - Ni)! * (N - n)!] / [N! * (N - Ni - n)!] # # Also use fact that gamma(x + 1) = x! nbarcodes = [] lnFactorial_N = scipy.special.gammaln(N + 1) if logspace and N > maxpoints: ncounts = list(scipy.unique(scipy.logspace( math.log10(1), math.log10(N), num=min(N, maxpoints)).astype('int'))) else: ncounts = list(scipy.unique(scipy.linspace( 1, N, num=min(N, maxpoints)).astype('int'))) for n in ncounts: lnFactorial_N_minus_n = scipy.special.gammaln(N - n + 1) i = scipy.nonzero(N - Nk - n >= 0) # indices where this is true nbarcodes.append( K - (num[i] * scipy.exp( scipy.special.gammaln(N - Nk[i] + 1) + lnFactorial_N_minus_n - lnFactorial_N - scipy.special.gammaln(N - Nk[i] - n + 1)) ).sum() ) return pd.DataFrame(dict(ncounts=ncounts, nbarcodes=nbarcodes))
[docs]def getCumulVariantsByCount(df, *, group_cols=None, group_cols_as_str=False): """Get number of variants with observed >= each number of times. Args: `df` (pandas DataFrame) A row for each variant, and a column named "count" giving how many times variant observed. Can have other columns. `group_cols` (None or list) Group by these columns and analyze each group separately. `group_cols_as_str` (bool) Explicitly convert any `group_cols` columns to str type. For some reason, this is needed if you are calling in ``R`` using `reticulate <https://rstudio.github.io/reticulate/>`_. Returns: A pandas Data Frame with columns named "count", "nvariants", and "total_variants" (plus any columns in `group_cols`). For each value of "count", "nvariants" gives the number of variants with >= that many counts. The "total_variants" column gives the total number of variants. Here is an example. First, create input Data Frame: >>> df = pd.DataFrame(dict( ... sample= ['a', 'a', 'b', 'b', 'a', 'a'], ... count= [ 9, 0, 1, 4, 3, 3])) Now apply function, first **not** grouping by sample. Note how the sample column is therefore simply ignored and dropped: >>> getCumulVariantsByCount(df) count nvariants total_variants 0 9 1 6 1 4 2 6 2 3 4 6 3 1 5 6 4 0 6 6 Now apply function, grouping by sample. Note how each sample is now analyzed separately: >>> getCumulVariantsByCount(df, group_cols=['sample']) sample count nvariants total_variants 0 a 9 1 4 1 a 3 3 4 2 a 0 4 4 3 b 4 1 2 4 b 1 2 2 """ if 'count' not in df.columns: raise ValueError('df does not have a column named "count"') if not group_cols: drop_group_cols = True group_cols=['dummy_col'] df[group_cols[0]] = '_' else: drop_group_cols = False if isinstance(group_cols, str): group_cols = [group_cols] if not set(group_cols).issubset(set(df.columns)): raise ValueError(f"invalid `group_cols` {group_cols}") df = ( df # get number of variants with each count .assign(nvariants=1) .groupby(group_cols + ['count']) .aggregate({'nvariants':'count'}) .reset_index() .sort_values(group_cols + ['count'], ascending=[True] * len(group_cols) + [False]) .reset_index(drop=True) # make nvariants cumulative number with <= number of counts .assign(nvariants=lambda x: x.groupby(group_cols) ['nvariants'] .cumsum()) # add new column that is total number of variants .assign(total_variants=lambda x: x.groupby(group_cols) ['nvariants'] .transform('max')) ) if drop_group_cols: df = df.drop(group_cols, axis='columns') elif group_cols_as_str: for col in group_cols: df[col] = df[col].astype('str') return df
[docs]def codonSubsToSeq(wildtype, codon_subs, return_aa=False, aa_subs=False): """Convert codon substitutions to sequence. Args: `wildtype` (str) The wildtype sequence `codon_subs` (str) String of space delimited codon substitutions, in the format: OldCodonSiteNewCodon `return_aa` (bool) Specify whether to return sequence as nucleotide or amino acid. Default is nucleotide. `aa_subs` (bool) Specify whether the substitutions are in amino acid form rather than codon. Default is codon. `return_aa` must be True in order for `aa_subs` to be True, since there are numerous possible nucleotide sequences for an amino acid sequence. Returns: A str of the sequence with all the codon substitutions Here is an example: >>> codonSubsToSeq('ATGGAACAA', '') 'ATGGAACAA' >>> codonSubsToSeq('ATGGAACAA', 'GAA2CAG') 'ATGCAGCAA' >>> codonSubsToSeq('ATGGAACAA', 'ATG1GGG GAA2CAG') 'GGGCAGCAA' """ # Make sure you are not trying to convert amino acids to codons if aa_subs: if not return_aa==True: raise ValueError('Cannot return nucleotide sequence using aa subs') # Make sure the wildtype sequence is divisible into codons if len(wildtype) % 3 != 0: raise ValueError('`wildtype` not divisible by 3') codon_list = [wildtype[3 * r : 3 * r + 3] for r in range(len(wildtype) // 3)] # Create parser for the codon substitutions if aa_subs: submatcher = re.compile('^(?P<wt>.)' '(?P<site>\d+)' '(?P<mut>.)$') if not aa_subs: submatcher = re.compile('^(?P<wt>[ATGC]{3})' '(?P<site>\d+)' '(?P<mut>[ATGC]{3})$') for codon in codon_subs.split(): m = submatcher.match(codon) if not m: raise ValueError(f"Invalid codon substitution {codon}") site = int(m.group('site')) - 1 if not (0 <= site < len(codon_list)): raise ValueError('Codon site out of bounds') if not aa_subs: if m.group('wt') != codon_list[site]: raise ValueError(f"Invalid wildtype codon in {codon}") else: if m.group('wt') != CODON_TO_AA[codon_list[site]]: raise ValueError(f"Invalid wildtype aa in {codon}") if m.group('wt') == m.group('mut'): raise ValueError(f"wildtype and mutant the same in {codon}") if not aa_subs: if m.group('mut') not in CODON_TO_AA: raise ValueError(f'Invalid mutant codon in {codon}') else: if m.group('mut') not in AAS_WITHSTOP: raise ValueError(f'Invalid mutant aa in {codon}') # Change to the mutant codon sub = m.group('mut') if aa_subs: sub = AA_TO_CODONS[sub][0] codon_list[site] = sub else: codon_list[site] = m.group('mut') # Return the sequence if not return_aa: return ''.join(codon_list) else: return ''.join(CODON_TO_AA[codon] for codon in codon_list)
[docs]def func_score_to_gpm(func_scores_df, wildtype, metric='func_score'): """Generate a genotype phenotype map from a functional score dataframe. Args: `func_scores_df` (functional score dataframe) A functional score dataframe (from the :meth:`CodonVariantTable.func_scores` method), narrowed down to one post sample condition, typically with something like: `func_scores_df.query('library == @library & sample == @sample')` `wildtype` (str) A string containing the wildtype sequence `metric` (str) A string specifying which metric to use as a phenotype Returns: A genotype phenotype map object from the Harms lab `gpmap package <https://github.com/harmslab/gpmap>`_ to be used as in input into epistasis models in the epistasis package. """ # Put the phenotypes into a list phenotypes = func_scores_df[metric].tolist() # Get standard deviations for each phenotype var = func_scores_df['func_score_var'].tolist() stdev = np.sqrt(var) # Get codon substitutions in a list substitutions = func_scores_df['aa_substitutions'].tolist() # Get a list of genotypes genotypes = [] for subs in substitutions: genotype = codonSubsToSeq(wildtype, subs, return_aa=True, aa_subs=True) genotypes.append(genotype) # Get the wildtype amino acid sequence wildtype = codonSubsToSeq(wildtype, '', return_aa=True) # Generate the genotype phenotype map gpm = gpmap.GenotypePhenotypeMap(wildtype=wildtype, genotypes=genotypes, phenotypes=phenotypes, stdeviations=stdev) return gpm
if __name__ == '__main__': import doctest doctest.testmod()